Skip to content

Commit

Permalink
[integ-test] rotate instance type for some integration tests
Browse files Browse the repository at this point in the history
1. Unlike OS rotation, instance type rotation is region dependent, because each region offers a different set of instance types. Therefore, we cannot use general Jinja variables like `{{ OS_X86_1 }}`; we need region-specific Jinja variables like `{{ US_EAST_1_INSTANCE_TYPE_0 }}`.
2. For code efficiency, this commit only populates three large AWS regions. The code is extendable if more regions should be added.
3. This commit rotates instance types only on `test_essential_features` and `test_cluster_with_gpu_health_checks`. The code is extendable if more tests should be added.
4. Improve `test_cluster_with_gpu_health_checks` to be able to run on both x86 and arm

Signed-off-by: Hanwen <hanwenli@amazon.com>
  • Loading branch information
hanwen-cluster committed Jan 23, 2025
1 parent 704b811 commit e9dc0f0
Show file tree
Hide file tree
Showing 4 changed files with 110 additions and 15 deletions.
8 changes: 4 additions & 4 deletions tests/integration-tests/configs/develop.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ test-suites:
basic:
test_essential_features.py::test_essential_features:
dimensions:
- regions: ["af-south-1"]
instances: {{ common.INSTANCES_DEFAULT_X86 }}
- regions: [{{ US_EAST_1_INSTANCE_TYPE_0_AZ }}]
instances: [{{ US_EAST_1_INSTANCE_TYPE_0 }}.xlarge]
oss: [{{ OS_X86_1 }}]
schedulers: ["slurm"]
capacity_reservations:
Expand Down Expand Up @@ -288,8 +288,8 @@ test-suites:
health_checks:
test_gpu_health_checks.py::test_cluster_with_gpu_health_checks:
dimensions:
- regions: ["eu-west-1"]
instances: {{ common.INSTANCES_DEFAULT_X86 }}
- regions: [{{ EU_WEST_1_GPU_INSTANCE_TYPE_0_AZ }}]
instances: [{{ EU_WEST_1_GPU_INSTANCE_TYPE_0 }}.xlarge]
oss: [{{ OS_X86_5 }}]
schedulers: ["slurm"]
iam:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import os
from datetime import date

import boto3
import yaml
from jinja2 import FileSystemLoader
from jinja2.sandbox import SandboxedEnvironment
Expand Down Expand Up @@ -60,6 +61,94 @@ def _get_os_parameters(config=None, args=None):
return result


def _get_instance_type_parameters(): # noqa: C901
"""Gets Instance jinja parameters."""
result = {}
excluded_instance_type_prefixes = [
"m1",
"m2",
"m3",
"m4",
"t1",
"t2",
"c1",
"c3",
"c4",
"r3",
"r4",
"x1",
"x1e",
"d2",
"h1",
"i2",
"i3",
"f1",
"g3",
"p2",
"p3",
]
for region in ["us-east-1", "us-west-2", "eu-west-1"]: # Only populate instance type for big regions
ec2_client = boto3.client("ec2", region_name=region)
# The following conversion is required becase Python jinja doesn't like "-"
region_jinja = region.replace("-", "_").upper()
try:
xlarge_instances = []
instance_type_availability_zones = {}
# Use describe_instance_types with pagination
paginator = ec2_client.get_paginator("describe_instance_type_offerings")

for page in paginator.paginate(LocationType="availability-zone"):
for instance_type in page["InstanceTypeOfferings"]:
# Check if instance type ends with '.xlarge'
if instance_type["InstanceType"].endswith(".xlarge") and not any(
instance_type["InstanceType"].startswith(prefix) for prefix in excluded_instance_type_prefixes
):
xlarge_instances.append(instance_type["InstanceType"])
if instance_type_availability_zones.get(instance_type["InstanceType"]):
instance_type_availability_zones[instance_type["InstanceType"]].append(
instance_type["Location"]
)
else:
instance_type_availability_zones[instance_type["InstanceType"]] = [
instance_type["Location"]
]

xlarge_instances = list(set(xlarge_instances)) # Remove redundancy.
gpu_instances = []
paginator = ec2_client.get_paginator("describe_instance_types")
for page in paginator.paginate(InstanceTypes=xlarge_instances):
for instance_type in page["InstanceTypes"]:
if instance_type.get("GpuInfo"):
gpu_instances.append(instance_type["InstanceType"])

xlarge_instances.sort()
gpu_instances.sort()
today_number = (date.today() - date(2020, 1, 1)).days
for index in range(len(xlarge_instances)):
instance_type = xlarge_instances[(today_number + index) % len(xlarge_instances)]
result[f"{region_jinja}_INSTANCE_TYPE_{index}"] = instance_type[: -len(".xlarge")]
availability_zones = instance_type_availability_zones[instance_type]
result[f"{region_jinja}_INSTANCE_TYPE_{index}_AZ"] = (
availability_zones[0] if len(availability_zones) <= 2 else region
)
for index in range(len(gpu_instances)):
instance_type = gpu_instances[(today_number + index) % len(gpu_instances)]
result[f"{region_jinja}_GPU_INSTANCE_TYPE_{index}"] = instance_type[: -len(".xlarge")]
availability_zones = instance_type_availability_zones[instance_type]
result[f"{region_jinja}_GPU_INSTANCE_TYPE_{index}_AZ"] = (
availability_zones[0] if len(availability_zones) <= 2 else region
)
except Exception as e:
print(f"Error getting instance types: {str(e)}. Using c5 and g4dn as the default instance type")
for index in range(100):
result[f"{region_jinja}_INSTANCE_TYPE_{index}"] = "c5"
result[f"{region_jinja}_INSTANCE_TYPE_{index}_AZ"] = region
for index in range(10):
result[f"{region_jinja}_GPU_INSTANCE_TYPE_{index}"] = "g4dn"
result[f"{region_jinja}_GPU_INSTANCE_TYPE_{index}_AZ"] = region
return result


def _get_available_amis_oss(architecture, args=None, config=None):
"""
Gets available AMIs for given architecture from input.
Expand Down Expand Up @@ -97,7 +186,9 @@ def read_config_file(config_file, print_rendered=False, config=None, args=None,
:return: a dict containing the parsed config file
"""
logging.info("Parsing config file: %s", config_file)
rendered_config = _render_config_file(config_file, **kwargs, **_get_os_parameters(config=config, args=args))
rendered_config = _render_config_file(
config_file, **kwargs, **_get_os_parameters(config=config, args=args), **_get_instance_type_parameters()
)
try:
return yaml.safe_load(rendered_config)
except Exception:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,11 @@ def test_cluster_with_gpu_health_checks(
),
},
}
cluster_config = pcluster_config_reader()
if architecture == "x86_64":
non_gpu_instance = "c5.xlarge"
else:
non_gpu_instance = "m6g.xlarge"
cluster_config = pcluster_config_reader(non_gpu_instance=non_gpu_instance)
cluster = clusters_factory(cluster_config)
assert_head_node_is_running(region, cluster)
remote_command_executor = RemoteCommandExecutor(cluster)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Image:
Os: {{ os }}
HeadNode:
InstanceType: {{ instance }}
InstanceType: {{ non_gpu_instance }}
Networking:
SubnetId: {{ public_subnet_id }}
Ssh:
Expand All @@ -18,49 +18,49 @@ Scheduling:
ComputeResources:
- Name: compute-resource-1
Instances:
- InstanceType: g4dn.xlarge
- InstanceType: {{ instance }}
HealthChecks:
Gpu:
Enabled: false
- Name: compute-resource-2
Instances:
- InstanceType: g4dn.xlarge
- InstanceType: {{ instance }}
HealthChecks:
Gpu:
Enabled: true
- Name: compute-resource-3
Instances:
- InstanceType: g4dn.xlarge
- InstanceType: {{ instance }}
MinCount: 1
- Name: compute-resource-4
Instances:
- InstanceType: c5.xlarge
- InstanceType: {{ non_gpu_instance }}
HealthChecks:
Gpu:
Enabled: false
- Name: compute-resource-5
Instances:
- InstanceType: c5.xlarge
- InstanceType: {{ non_gpu_instance }}
HealthChecks:
Gpu:
Enabled: true
- Name: compute-resource-6
Instances:
- InstanceType: c5.xlarge
- InstanceType: {{ non_gpu_instance }}
Networking:
SubnetIds:
- {{ private_subnet_id }}
- Name: queue-2
ComputeResources:
- Name: compute-resource-1
Instances:
- InstanceType: g4dn.xlarge
- InstanceType: {{ instance }}
HealthChecks:
Gpu:
Enabled: true
- Name: compute-resource-2
Instances:
- InstanceType: c5.xlarge
- InstanceType: {{ non_gpu_instance }}
HealthChecks:
Gpu:
Enabled: true
Expand Down

0 comments on commit e9dc0f0

Please sign in to comment.