add hooks CI and update README (#17)
* add hooks CI and update README

* remove requirements files
julhien authored Dec 29, 2023
1 parent cf96f4b commit 1890453
Showing 14 changed files with 192 additions and 82 deletions.
28 changes: 28 additions & 0 deletions .github/pull_request_template.md
@@ -0,0 +1,28 @@
<!--- Provide a general summary of your changes in the Title above -->

## Description
<!--- Describe your changes in detail -->

## Motivation and Context
<!--- Why is this change required? What problem does it solve? -->
<!--- If it fixes an open issue, please link to the issue here. -->

## How Has This Been Tested?
<!--- Please describe in detail how you tested your changes. -->
<!--- Include details of your testing environment, the tests you ran to -->
<!--- see how your change affects other areas of the code, etc. -->

## Screenshots (if appropriate):

## Types of Changes
<!--- What types of changes does your code introduce? Put an `x` in all the boxes that apply: -->
- [ ] Bug fix (non-breaking change that fixes an issue)
- [ ] New feature (non-breaking change that adds functionality)
- [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)

## Checklist:
<!--- Go over all the following points, and put an `x` in all the boxes that apply. -->
<!--- If you're unsure about any of these, don't hesitate to ask. We're here to help! -->
- [ ] My code follows the code style of this project.
- [ ] My change requires a change to the documentation.
- [ ] I have updated the documentation accordingly.
52 changes: 52 additions & 0 deletions .github/workflows/ci.yaml
@@ -0,0 +1,52 @@
name: CI

on:
  pull_request:
    branches:
      - "*"
  push:
    branches:
      - "main"
      - "dev"

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v2

      - name: Set up Python 3.9
        uses: actions/setup-python@v5
        with:
          python-version: "3.9"
          cache: "pip"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install .[dev]

      - name: Run Pytest
        run: |
          pytest udao

  hooks:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v2

      - name: Set up Python 3.9
        uses: actions/setup-python@v5
        with:
          python-version: "3.9"
          cache: "pip"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install .[dev]

      - name: Run pre-commit hooks
        run: pre-commit run --all-files
2 changes: 1 addition & 1 deletion .github/workflows/documentation.yaml
@@ -32,7 +32,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements-cpu.txt
pip install .[dev]
pip install sphinx sphinx_book_theme myst_parser
- name: Sphinx build
38 changes: 0 additions & 38 deletions .github/workflows/pytest.yaml

This file was deleted.

73 changes: 73 additions & 0 deletions CONTRIBUTING.md
@@ -0,0 +1,73 @@
# Contributing to `udao`

Welcome to our project! We appreciate your interest in contributing to `udao`.

## Types of Contributions

You can contribute to `udao` in many ways. Here are some examples:

- Reporting issues.
- Fixing typos and grammatical errors.
- Improving the documentation.
- Adding new features.
- Fixing bugs.

## Installing the Project for Development

You can install the project for development by running the following command:

```
pip install -e .[dev]
```

## Pre-commit Hooks

You can install pre-commit hooks by running the following command:

```
pre-commit install
```

Pre-commit hooks will then be run at each commit.
You can also run the pre-commit hooks manually by running the following command:

```
pre-commit run --all-files
```

## Code Style

- We use [black](https://pypi.org/project/black/) for formatting our code.
- We use [mypy](https://mypy.readthedocs.io/en/stable/) for type checking.
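
As a hedged illustration (not code from this repository), a function that satisfies both tools — black's formatting rules and mypy's type checking — looks like this:

```python
from typing import Optional


def scale_values(values: list[float], factor: Optional[float] = None) -> list[float]:
    """Multiply each value by `factor`, defaulting to 1.0 when unset."""
    effective = 1.0 if factor is None else factor
    return [v * effective for v in values]


print(scale_values([1.0, 2.0], factor=2.0))  # [2.0, 4.0]
```

Full annotations on parameters and return types let mypy catch mismatches at check time, and black settles formatting questions before they reach review.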

## Documentation

We use [Sphinx](https://www.sphinx-doc.org/en/master/) for documentation.
The documentation is hosted on GitHub Pages.

To build the documentation locally, run the following command:

```
cd docs
make html
```

## Running Tests

We use [pytest](https://docs.pytest.org/en/stable/) for testing.
To run the tests, run the following command:

```
pytest udao
```
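
As a generic sketch of what such a test looks like (the module and function names below are hypothetical, not taken from `udao`), pytest collects any `test_*` function and runs its bare `assert` statements:

```python
# test_normalize.py -- a hypothetical module; pytest collects test_* functions
def normalize(values: list[float]) -> list[float]:
    """Scale values so they sum to 1.0 (assumes a nonzero total)."""
    total = sum(values)
    return [v / total for v in values]


def test_normalize_sums_to_one() -> None:
    result = normalize([2.0, 2.0])
    assert result == [0.5, 0.5]
    assert abs(sum(result) - 1.0) < 1e-9
```

Running `pytest test_normalize.py` would execute this test; in this repository the suite lives under `udao/`, hence `pytest udao`.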

## Submitting a Pull Request

- Ensure your code passes all CI checks (pre-commit hooks, tests, and documentation build).
- Submit your PR with a detailed description.

## Questions or Need Help?

- Contact us at chenghao@cs.umass.edu

Thank you for contributing to udao!
25 changes: 22 additions & 3 deletions README.md
@@ -1,5 +1,9 @@
# UDAO
the Unified Data Analytics Optimizer (UDAO) package enables the optimization of data analytics pipelines.
This repository is the home of the UDAO library - a next-generation unified data analytics optimizer.

References:
- [Spark-based Cloud Data Analytics using Multi-Objective Optimization](https://ieeexplore.ieee.org/document/9458826/)
- [UDAO: a next-generation unified data analytics optimizer](https://dl.acm.org/doi/10.14778/3352063.3352103)

## Getting Started

@@ -11,11 +15,11 @@ Using pip:
pip install udao
```

## Install on GPU
### Install on GPU

The current GPU version relies on CUDA 11.8 and PyTorch 2.0.1. The following instructions are for installing the GPU version of UDAO.

### Requirements
#### Requirements

Before installing, please make sure you have the following dependencies installed (using pip):

@@ -27,3 +31,18 @@ pip install torchaudio==2.0.2 -f https://download.pytorch.org/whl/cu118
pip install dglgo==0.0.2
pip install dgl -f https://data.dgl.ai/wheels/cu118/repo.html
```

### Documentation
You can find the documentation on our [GitHub Pages](https://angryrou.github.io/udao/).

## Limitations

Some known limitations:
1. `pandas.DataFrame` does not scale well to very large datasets.
2. Categorical variables are always enumerated in MOGD.
3. Preprocessed data are not cached for reuse during hyper-parameter tuning.

## Contributing

We welcome contributions!
You can go to [CONTRIBUTING.md](CONTRIBUTING.md) for more information.
5 changes: 3 additions & 2 deletions pyproject.toml
@@ -62,11 +62,12 @@ dev = [
"pre-commit==3.4.0",
"types-requests==2.31.0.10",
"requests==2.31.0",
]
test = [
"pytest==7.4.2",
"pytest-cov==4.1.0",
"pytest-mock==3.12.0",
"chardet==5.2.0",
"sphinx==7.2.6",
"sphinx-book-theme==1.1.0",
]

# List URLs that are relevant to your project
4 changes: 0 additions & 4 deletions requirements-cpu.txt

This file was deleted.

8 changes: 0 additions & 8 deletions requirements-gpu.txt

This file was deleted.

21 changes: 0 additions & 21 deletions requirements.txt

This file was deleted.

14 changes: 10 additions & 4 deletions udao/data/tests/extractors/test_query_structure_extractor.py
@@ -70,10 +70,13 @@ def test_extract_structure_from_df_returns_correct_shape(
],
names=["plan_id", "operation_id"],
)
assert structure_container.graph_meta_features is not None
assert structure_container.graph_meta_features.shape == (len(df_fixture), 2)
assert (
graph_meta_features := structure_container.graph_meta_features
) is not None
assert graph_meta_features.shape == (len(df_fixture), 2)

np.testing.assert_array_equal(
structure_container.graph_meta_features.columns, ["rows_count", "size"]
graph_meta_features.columns, ["rows_count", "size"]
)
assert (multi_index == structure_container.graph_features.index).all()

@@ -99,14 +102,17 @@ def test_extract_structure_from_df_returns_correct_values(
row.id, row.plan, "val"
)
for feature in ["rows_count", "size"]:
assert (
graph_meta_features := structure_container.graph_meta_features
) is not None
np.testing.assert_allclose(
structure_container.graph_features.loc[row.id][feature].values,
features_dict[feature],
rtol=1e-6,
)
assert structure_container.graph_meta_features is not None
np.testing.assert_allclose(
structure_container.graph_meta_features.loc[row.id][feature],
graph_meta_features.loc[row.id][feature],
features_dict[f"meta_{feature}"],
rtol=1e-6,
)
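The refactor in this file binds and narrows `graph_meta_features` in a single `assert` using the walrus operator, so mypy treats the bound name as non-`Optional` in the statements that follow. A minimal standalone sketch of the pattern (the function here is hypothetical):

```python
from typing import Optional


def load_meta() -> Optional[list[int]]:
    """Stand-in for an attribute that may be None."""
    return [0, 10]


# Binding and checking in one expression keeps the narrowed name available,
# instead of repeating `x.attr is not None` asserts before each access.
assert (meta := load_meta()) is not None
assert meta == [0, 10]  # `meta` is now list[int], not Optional[list[int]]
```

This avoids re-asserting non-`None` on the same attribute before every use, which is exactly what the diff above replaces.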
1 change: 1 addition & 0 deletions udao/data/tests/iterators/test_query_graph_iterator.py
@@ -92,6 +92,7 @@ def test_get_item(self, sample_iterator: QueryPlanIterator) -> None:

def test_get_graph(self, sample_iterator: QueryPlanIterator) -> None:
graph, meta = sample_iterator._get_graph_and_meta("a")
assert meta is not None
assert th.equal(meta, th.tensor([0, 10], dtype=th.float32))
assert isinstance(graph, dgl.DGLGraph)
assert graph.number_of_nodes() == 2
2 changes: 1 addition & 1 deletion udao/data/utils/utils.py
@@ -14,7 +14,7 @@ def train_test_val_split_on_column(
*,
val_frac: float,
test_frac: float,
random_state: Optional[int] = None
random_state: Optional[int] = None,
) -> Dict[DatasetType, pd.DataFrame]:
"""return a dictionary of DatasetType (train/val/test) and the DataFrame"""
train_df, non_train_df = train_test_split(
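For context, `train_test_val_split_on_column` splits a DataFrame into train/val/test subsets by fractions (the hunk above only adds a trailing comma). A dependency-free sketch of the same two-stage idea, with hypothetical names and a hold-out-first ordering chosen for brevity:

```python
import random
from typing import Dict, List


def two_stage_split(
    items: List[str],
    *,
    val_frac: float,
    test_frac: float,
    seed: int = 0,
) -> Dict[str, List[str]]:
    """Shuffle once, then carve off test and val slices; the rest is train."""
    rng = random.Random(seed)
    shuffled = items[:]
    rng.shuffle(shuffled)
    n_test = int(len(shuffled) * test_frac)
    n_val = int(len(shuffled) * val_frac)
    return {
        "test": shuffled[:n_test],
        "val": shuffled[n_test : n_test + n_val],
        "train": shuffled[n_test + n_val :],
    }


splits = two_stage_split(list("abcdefghij"), val_frac=0.2, test_frac=0.2)
# 10 items at 20%/20% -> 2 test, 2 val, 6 train
```

Seeding the generator (here via `seed`, mirroring the `random_state` parameter above) keeps the split reproducible across runs.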
1 change: 1 addition & 0 deletions udao/model/embedders/base_embedder.py
@@ -3,6 +3,7 @@
from typing import Any

from torch import nn

from udao.utils.interfaces import UdaoEmbedItemShape


