Support accelerate training and add test configs for SmolVLA

- 2-GPU SLURM job (distributed training)
- 1-GPU local accelerate and direct training scripts
- Accelerate configs for 1-GPU and 2-GPU setups
This commit is contained in:
danaaubakirova
2025-09-04 13:07:25 +00:00
parent 882c80d446
commit d148279921
8 changed files with 321 additions and 34 deletions
+11
View File
@@ -0,0 +1,11 @@
# Accelerate launch configuration: a single process on one local machine,
# with no distributed backend and no mixed precision.
compute_environment: LOCAL_MACHINE
debug: false
# Quoted on purpose: a bare NO is resolved to boolean false by YAML 1.1
# loaders (e.g. PyYAML) instead of the string 'NO'.
distributed_type: 'NO'
downcast_bf16: 'no'
enable_cpu_affinity: false
machine_rank: 0
main_training_function: main
# 'no' is quoted for the same reason as distributed_type above.
mixed_precision: 'no'
num_machines: 1
num_processes: 1
use_cpu: false
+18
View File
@@ -0,0 +1,18 @@
# Accelerate launch configuration: two processes on one local machine
# using the MULTI_GPU distributed type.

# --- Environment ---
compute_environment: LOCAL_MACHINE
use_cpu: false
gpu_ids: all
enable_cpu_affinity: false
debug: false

# --- Process topology ---
distributed_type: MULTI_GPU
num_machines: 1
num_processes: 2
machine_rank: 0
rdzv_backend: static
same_network: true

# --- Training entry point and numerics ---
main_training_function: main
# Quoted so these stay strings; a bare no would load as boolean false
# under YAML 1.1.
mixed_precision: 'no'
downcast_bf16: 'no'
dynamo_backend: 'no'

# --- TPU options ---
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false