Skip to content

Commit

Permalink
Merge pull request #37 from sb-ai-lab/feature/gpu-support
Browse files Browse the repository at this point in the history
Add GPU support and fix logging
  • Loading branch information
zakharova-anastasiia authored Jun 6, 2024
2 parents 84e97f0 + 97fd9ae commit 7b8172e
Show file tree
Hide file tree
Showing 49 changed files with 1,204 additions and 562 deletions.
2 changes: 1 addition & 1 deletion docker/grpc-base-cpu.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,5 @@ RUN poetry install --only-root
WORKDIR /opt/stalactite
ENV GIT_PYTHON_REFRESH="quiet"
LABEL framework="stalactite"

COPY ./plugins /opt/plugins
# docker build -f ./docker/grpc-base.dockerfile -t grpc-base:latest .
2 changes: 1 addition & 1 deletion docker/grpc-base.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,5 @@ WORKDIR /opt/stalactite
ENV GIT_PYTHON_REFRESH="quiet"
ENV CUDA_DEVICE_ORDER="PCI_BUS_ID"
LABEL framework="stalactite"

COPY ./plugins /opt/plugins
# docker build -f ./docker/grpc-base.dockerfile -t grpc-base:latest .
1 change: 1 addition & 0 deletions docs/tutorial.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ And to launch distributed multinode or multiprocess experiment go to :ref:`distr
tutorials/distr_communicator_tutorial
tutorials/inference_tutorial
tutorials/batching_tutorial
tutorials/plugins
tutorials/master_types


Expand Down
40 changes: 40 additions & 0 deletions docs/tutorials/plugins.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
.. _plugins_tutorial:

*how-to:* Implement your own ML-algorithm (plugin)
====================================================

The implemented algorithms are listed in :ref:`master_types`. If you want to incorporate your own
logic into the framework, you should write agent classes implementing the specifics of your algorithm.
For the framework to find the plugins you write, create (or reuse the existing) `plugins` folder
alongside the source code of `Stalactite`.



.. code-block:: bash

    |-- ...
    |-- plugins
    |-- stalactite
    `-- ...

In the `plugins` folder, create a folder containing your agents. The name of this folder does not matter, but
it is important for agent implementation discovery that you name your files correctly:

- the master class implementation should be placed in a file named: `party_master.py`
- the member class implementation should be placed in a file named: `party_member.py`
- the arbiter (if implemented) should be placed in a file named: `party_arbiter.py`

We have copied the honest logistic regression implementation into the repository `plugins` folder for you to see as an example.
At runtime, to use the plugin in the experiment, the configuration file must be adjusted accordingly. For example, to make the framework use
the honest logistic regression implementation from the plugins folder, you should change the ``vfl_model.vfl_model_name``
to the path from `plugins` to your directory with agents' files.
.. code-block:: yaml

    vfl_model:
      vfl_model_name: plugins.logistic_regression

After performing the aforementioned steps, the framework should be able to discover implemented agents and will use them
in the experiment.
64 changes: 64 additions & 0 deletions examples/configs/efficientnet-splitNN-mnist-multiprocess.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Multiprocess VFL experiment: EfficientNet split-NN on MNIST, 2 parties, GPU enabled.
common:
  report_train_metrics_iteration: 1   # report train metrics every iteration
  report_test_metrics_iteration: 1    # report test metrics every iteration
  world_size: 2                       # number of member parties
  experiment_label: experiment-efficientnet-mnist-local
  reports_export_folder: "../../reports"
  seed: 22                            # RNG seed for reproducibility

vfl_model:
  epochs: 2
  batch_size: 250
  eval_batch_size: 9000
  vfl_model_name: efficientnet        # selects the built-in efficientnet implementation
  is_consequently: False
  use_class_weights: True
  learning_rate: 0.01
  do_train: True
  do_predict: False
  do_save_model: True
  vfl_model_path: ../../saved_models/efficientnet_model

prerequisites:
  mlflow_host: 'node3.bdcl'
  mlflow_port: '5555'

master:
  external_host: 'node3.bdcl'
  run_mlflow: True
  # Parameters forwarded to the master-side model constructor.
  master_model_params: {
    input_dim: 128,
    dropout: 0.2,
    num_classes: 10,
  }
  run_prometheus: False
  port: "50051"                       # gRPC port, quoted to keep it a string
  logging_level: 'debug'
  disconnect_idle_client_time: 500.   # float; presumably seconds — confirm against master docs
  recv_timeout: 3600.                 # float; presumably seconds
  cuda_visible_devices: "0"           # GPU index exposed to the master process

member:
  # Parameters forwarded to the member-side model constructor.
  member_model_params: {
    width_mult: 0.1,
    depth_mult: 0.1,
  }
  heartbeat_interval: 2.
  logging_level: 'info'
  recv_timeout: 3600.

data:
  dataset_size: 750
  dataset: 'mnist'
  host_path_data_dir: ../../data/sber_ds_vfl/mnist_efficientnet_multiclass
  dataset_part_prefix: 'part_'  # used in dataset folder structure inspection. Concatenated with the index of a party: 0,1,... etc.
  train_split: "train_train"  # name of the train split
  test_split: "train_val"  # name of the test split
  features_key: "image_part_"
  label_key: "label"
  uids_key: "image_idx"

docker:
  docker_compose_command: "docker compose"
  docker_compose_path: '../../prerequisites'
  use_gpu: True                       # enable GPU passthrough in the compose setup
72 changes: 32 additions & 40 deletions examples/configs/linreg-mnist-multiprocess.yml
Original file line number Diff line number Diff line change
@@ -1,66 +1,58 @@
common:
experiment_label: experiment-vm
reports_export_folder: "../../reports"
report_train_metrics_iteration: 1
report_test_metrics_iteration: 1
world_size: 2
experiment_label: test-experiment-mnist-local
reports_export_folder: ../../reports
seed: 22

vfl_model:
epochs: 5
batch_size: 1000
epochs: 2
batch_size: 5000
eval_batch_size: 200
vfl_model_name: linreg
is_consequently: False
use_class_weights: False
learning_rate: 0.01

prerequisites:
mlflow_host: 'node3.bdcl'
mlflow_port: '5555'
prometheus_host: 'node3.bdcl'
prometheus_port: '9090'
grafana_port: '3001'
learning_rate: 0.2
do_train: True
do_predict: True
do_save_model: True
vfl_model_path: ../../saved_models/linreg_model

data:
dataset_size: 1000
host_path_data_dir: ../../data/sber_ds_vfl/mnist_binary38_parts2
dataset_size: 5000
dataset: 'mnist'
dataset_part_prefix: 'part_'
train_split: "train_train"
test_split: "train_val"
host_path_data_dir: ../../data/sber_ds_vfl/mnist_vfl_parts2
dataset_part_prefix: 'part_' # used in dataset folder structure inspection. Concatenated with the index of a party: 0,1,... etc.
train_split: "train_train" # name of the train split
test_split: "train_val" # name of the test split
features_key: "image_part_"
label_key: "label"
uids_key: "image_idx"

grpc_server:
port: '50051'
max_message_size: -1
# server_threadpool_max_workers: 10

prerequisites:
mlflow_host: 'node3.bdcl'
mlflow_port: '5555'

master:
external_host: 'node3.bdcl'
run_mlflow: True
run_prometheus: True
run_prometheus: False
port: "50051"
logging_level: 'debug'
disconnect_idle_client_time: 120.
# time_between_idle_connections_checks: 3
# recv_timeout: 360
disconnect_idle_client_time: 500.
recv_timeout: 3600.
cuda_visible_devices: "0"

member:
logging_level: 'debug'
member_model_params: {
output_dim: 1,
reg_lambda: 0.5
}
heartbeat_interval: 2.
# heartbeat_interval: 2
# sent_task_timout: 3600
logging_level: 'info'
recv_timeout: 3600.

docker:
docker_compose_command: "docker compose"
docker_compose_path: '../../prerequisites'
use_gpu: False


#grpc_arbiter:
# use_arbiter: False





use_gpu: True
62 changes: 62 additions & 0 deletions examples/configs/mlp-splitNN-sbol-smm-multiprocess.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Multiprocess VFL experiment: MLP split-NN on the sbol_smm multilabel dataset, 2 parties, GPU enabled.
common:
  report_train_metrics_iteration: 10  # report train metrics every 10th iteration
  report_test_metrics_iteration: 10   # report test metrics every 10th iteration
  world_size: 2                       # number of member parties
  experiment_label: experiment-mlp-sbol-smm-local
  reports_export_folder: "../../reports"
  seed: 22                            # RNG seed for reproducibility

vfl_model:
  epochs: 2
  batch_size: 250
  eval_batch_size: 200
  vfl_model_name: mlp                 # selects the built-in MLP implementation
  is_consequently: False
  use_class_weights: True
  learning_rate: 0.01
  do_train: True
  do_predict: False
  do_save_model: True
  vfl_model_path: ../../saved_models/mlp_model

prerequisites:
  mlflow_host: 'node3.bdcl'
  mlflow_port: '5555'

master:
  external_host: 'node3.bdcl'
  run_mlflow: True
  # Parameters forwarded to the master-side model constructor.
  master_model_params: {
    input_dim: 100,
    output_dim: 19,
    multilabel: True,
  }
  run_prometheus: False
  port: "50051"                       # gRPC port, quoted to keep it a string
  logging_level: 'debug'
  disconnect_idle_client_time: 500.   # float; presumably seconds — confirm against master docs
  recv_timeout: 3600.                 # float; presumably seconds
  cuda_visible_devices: "0"           # GPU index exposed to the master process

member:
  # Parameters forwarded to the member-side model constructor.
  # NOTE: a space is required after ':' before the flow sequence — 'hidden_channels:[...]'
  # is rejected by strict YAML 1.2 parsers.
  member_model_params: {
    hidden_channels: [1000, 300, 100],
  }
  heartbeat_interval: 2.
  logging_level: 'info'
  recv_timeout: 3600.

data:
  dataset_size: 10000
  dataset: 'sbol_smm'
  host_path_data_dir: ../../data/sber_ds_vfl/multilabel_sber_sample10000_smm_parts2
  dataset_part_prefix: 'part_'  # used in dataset folder structure inspection
  train_split: "train_train"  # name of the train split
  test_split: "train_val"  # name of the test split
  features_key: "features_part_"
  label_key: "labels"
  # NOTE(review): no 'uids_key' here unlike the mnist configs — confirm whether sbol_smm needs one

docker:
  docker_compose_command: "docker compose"
  docker_compose_path: '../../prerequisites'
  use_gpu: True                       # enable GPU passthrough in the compose setup
61 changes: 61 additions & 0 deletions examples/configs/resnet-splitNN-sbol-smm-multiprocess.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Multiprocess VFL experiment: ResNet split-NN on the sbol_smm multilabel dataset, 2 parties, GPU enabled.
common:
  report_train_metrics_iteration: 1   # report train metrics every iteration
  report_test_metrics_iteration: 1    # report test metrics every iteration
  world_size: 2                       # number of member parties
  experiment_label: experiment-resnet-sbol-smm-local
  reports_export_folder: "../../reports"
  # NOTE(review): no 'seed' set here, unlike the sibling configs — confirm whether this run should be seeded

vfl_model:
  epochs: 2
  batch_size: 2500
  eval_batch_size: 2000
  vfl_model_name: resnet              # selects the built-in ResNet implementation
  is_consequently: False
  use_class_weights: False
  learning_rate: 0.01
  do_train: True
  do_predict: False
  do_save_model: True
  vfl_model_path: ../../saved_models/resnet_model

prerequisites:
  mlflow_host: 'node3.bdcl'
  mlflow_port: '5555'

master:
  external_host: 'node3.bdcl'
  run_mlflow: True
  # Parameters forwarded to the master-side model constructor.
  master_model_params: {
    input_dim: 1356,
    output_dim: 19,
    use_bn: True,
  }
  run_prometheus: False
  port: "50051"                       # gRPC port, quoted to keep it a string
  logging_level: 'debug'
  disconnect_idle_client_time: 500.   # float; presumably seconds — confirm against master docs
  recv_timeout: 3600.                 # float; presumably seconds
  cuda_visible_devices: "0"           # GPU index exposed to the master process

member:
  # Parameters forwarded to the member-side model constructor.
  member_model_params: {
    hid_factor: [ 1, 1 ],
  }
  heartbeat_interval: 2.
  logging_level: 'info'
  recv_timeout: 3600.

data:
  dataset_size: 10000
  dataset: 'sbol_smm'
  host_path_data_dir: ../../data/sber_ds_vfl/multilabel_sber_sample10000_smm_parts2
  dataset_part_prefix: 'part_'  # used in dataset folder structure inspection
  train_split: "train_train"  # name of the train split
  test_split: "train_val"  # name of the test split
  features_key: "features_part_"
  label_key: "labels"
  # NOTE(review): no 'uids_key' here unlike the mnist configs — confirm whether sbol_smm needs one

docker:
  docker_compose_command: "docker compose"
  docker_compose_path: '../../prerequisites'
  use_gpu: True                       # enable GPU passthrough in the compose setup
Empty file.
Loading

0 comments on commit 7b8172e

Please sign in to comment.