chore: Regenerate all playbooks

Author: GitLab CI
Date: 2025-11-20 18:30:59 +00:00
parent 8f8e2e6f5d
commit a8a81dd1a1
12 changed files with 511 additions and 21 deletions

View File

@ -22,6 +22,7 @@ from trl import SFTConfig, SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer
# Define prompt templates
ALPACA_PROMPT_TEMPLATE = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction: {}
@ -77,7 +78,7 @@ def main(args):
"seed": 42,
"dataset_text_field": "text",
"packing": False,
"max_seq_length": args.seq_length,
"max_length": args.seq_length,
"torch_compile": False,
"report_to": "none",
"logging_dir": args.log_dir,
@ -192,4 +193,4 @@ if __name__ == "__main__":
print(f"Torch compile: {args.use_torch_compile}")
print(f"{'='*60}\n")
main(args)
main(args)

View File

@ -0,0 +1,223 @@
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import torch
import argparse
import os
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, TaskType
# Define prompt templates
ALPACA_PROMPT_TEMPLATE = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction: {}
### Input: {}
### Response: {}"""
def get_alpaca_dataset(eos_token, dataset_size=500):
# Preprocess the dataset
def preprocess(x):
texts = [
ALPACA_PROMPT_TEMPLATE.format(instruction, input, output) + eos_token
for instruction, input, output in zip(x["instruction"], x["input"], x["output"])
]
return {"text": texts}
dataset = load_dataset("tatsu-lab/alpaca", split="train").select(range(dataset_size)).shuffle(seed=42)
return dataset.map(preprocess, remove_columns=dataset.column_names, batched=True)
def main(args):
# Load the model and tokenizer
print(f"Loading model: {args.model_name}")
print(f"Training mode: LoRA")
# When using FSDP, don't use device_map to avoid loading full model on one device
# FSDP will handle sharding and device placement
# With fsdp_cpu_ram_efficient_loading=true in config, model loads on meta device first,
# then FSDP shards and materializes it across all devices/nodes
# This prevents OOM when a single node can't hold the full model
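# (The multi-node FSDP settings themselves live in configs/config_fsdp_lora.yaml,
# which the launch scripts later in this commit pass to accelerate launch.)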
model = AutoModelForCausalLM.from_pretrained(
args.model_name,
dtype=args.dtype,
low_cpu_mem_usage=True, # Use lazy loading to reduce memory usage
trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
# Prepare model for LoRA training
print(f"Preparing model for LoRA with rank {args.lora_rank}...")
peft_config = LoraConfig(
r=args.lora_rank,
target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha=16,
lora_dropout=0,
task_type=TaskType.CAUSAL_LM
)
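# Note: the LoRA adapters are attached later by SFTTrainer via peft_config, so the
# parameter counts printed below reflect the base model, not the LoRA adapter.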
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f"Trainable parameters: {trainable_params:,} ({100 * trainable_params / total_params:.2f}%)")
# Load and preprocess the dataset
print(f"Loading dataset with {args.dataset_size} samples...")
dataset = get_alpaca_dataset(tokenizer.eos_token, args.dataset_size)
# Configure the SFT config
config = {
"per_device_train_batch_size": args.batch_size,
"num_train_epochs": 0.01, # Warmup epoch
"gradient_accumulation_steps": args.gradient_accumulation_steps,
"learning_rate": args.learning_rate,
"optim": "adamw_torch",
"save_strategy": 'no',
"remove_unused_columns": False,
"seed": 42,
"dataset_text_field": "text",
"packing": False,
"max_length": args.seq_length,
"torch_compile": False,
"report_to": "none",
"logging_dir": args.log_dir,
"logging_steps": args.logging_steps,
"gradient_checkpointing": args.gradient_checkpointing
}
# Compile model if requested
if args.use_torch_compile:
print("Compiling model with torch.compile()...")
model = torch.compile(model)
# Warmup for torch compile
print("Running warmup for torch.compile()...")
SFTTrainer(
model=model,
processing_class=tokenizer,
train_dataset=dataset,
args=SFTConfig(**config),
).train()
# Train the model
print(f"\nStarting LoRA fine-tuning for {args.num_epochs} epoch(s)...")
config["num_train_epochs"] = args.num_epochs
config["report_to"] = "tensorboard"
trainer = SFTTrainer(
model=model,
processing_class=tokenizer,
train_dataset=dataset,
args=SFTConfig(**config),
peft_config=peft_config,
)
trainer_stats = trainer.train()
# Print training statistics
print(f"\n{'='*60}")
print("TRAINING COMPLETED")
print(f"{'='*60}")
print(f"Training runtime: {trainer_stats.metrics['train_runtime']:.2f} seconds")
print(f"Samples per second: {trainer_stats.metrics['train_samples_per_second']:.2f}")
print(f"Steps per second: {trainer_stats.metrics['train_steps_per_second']:.2f}")
print(f"Train loss: {trainer_stats.metrics['train_loss']:.4f}")
print(f"{'='*60}\n")
# Save model if requested
if args.output_dir:
print(f"Saving model to {args.output_dir}...")
trainer.save_model(args.output_dir)
tokenizer.save_pretrained(args.output_dir)
print("Model saved successfully!")
def parse_arguments():
parser = argparse.ArgumentParser(description="Llama 3.1 70B Fine-tuning with LoRA")
# Model configuration
parser.add_argument("--model_name", type=str, default="meta-llama/Llama-3.1-70B-Instruct",
help="Model name or path")
parser.add_argument("--dtype", type=str, default="bfloat16",
help="Model dtype (e.g., float32, float16, bfloat16)")
# Training configuration
parser.add_argument("--batch_size", type=int, default=4,
choices=[1, 2, 4, 8, 16, 32],
help="Per device training batch size")
parser.add_argument("--seq_length", type=int, default=2048,
choices=[256, 512, 1024, 2048, 4096, 8192],
help="Maximum sequence length")
parser.add_argument("--num_epochs", type=int, default=1,
help="Number of training epochs")
parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
help="Gradient accumulation steps")
parser.add_argument("--learning_rate", type=float, default=1e-4,
help="Learning rate")
parser.add_argument("--gradient_checkpointing", action=argparse.BooleanOptionalAction, default=True,
help="Enable gradient checkpointing to save memory (default: enabled)")
# LoRA configuration
parser.add_argument("--lora_rank", type=int, default=8,
help="LoRA rank")
# Dataset configuration
parser.add_argument("--dataset_size", type=int, default=500,
help="Number of samples to use from dataset")
# Logging configuration
parser.add_argument("--logging_steps", type=int, default=1,
help="Log every N steps")
parser.add_argument("--log_dir", type=str, default="logs",
help="Directory for logs")
# Compilation and saving
parser.add_argument("--use_torch_compile", action="store_true",
help="Use torch.compile() for faster training")
parser.add_argument("--output_dir", type=str, default=None,
help="Directory to save the fine-tuned model")
return parser.parse_args()
if __name__ == "__main__":
args = parse_arguments()
print(f"\n{'='*60}")
print("LLAMA 3.1 70B LoRA FINE-TUNING")
print(f"{'='*60}")
print(f"Model: {args.model_name}")
print(f"Training mode: LoRA")
print(f"Batch size: {args.batch_size}")
print(f"Gradient accumulation: {args.gradient_accumulation_steps}")
print(f"Effective batch size: {args.batch_size * args.gradient_accumulation_steps}")
print(f"Sequence length: {args.seq_length}")
print(f"Number of epochs: {args.num_epochs}")
print(f"Learning rate: {args.learning_rate}")
print(f"LoRA rank: {args.lora_rank}")
print(f"Dataset size: {args.dataset_size}")
print(f"Gradient checkpointing: {args.gradient_checkpointing}")
print(f"Torch compile: {args.use_torch_compile}")
print(f"{'='*60}\n")
main(args)
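For reference, a minimal per-node invocation of this script through Accelerate; the config path and script path are the ones used by the launch scripts later in this commit, while the argument values are illustrative and map to the argparse options defined above:

accelerate launch --config_file=/workspace/configs/config_fsdp_lora.yaml \
    /workspace/Llama3_70B_LoRA_finetuning.py \
    --batch_size 1 --seq_length 2048 --lora_rank 8 \
    --gradient_checkpointing --output_dir /workspace/llama31-70b-lora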

View File

@ -59,7 +59,8 @@ def main(args):
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type='nf4',
bnb_4bit_compute_dtype=getattr(torch, args.dtype),
bnb_4bit_compute_dtype=args.dtype,
bnb_4bit_quant_storage=args.dtype
)
model = AutoModelForCausalLM.from_pretrained(
@ -74,15 +75,15 @@ def main(args):
# Prepare model for QLoRA training
print(f"Preparing model for QLoRA (4-bit) with rank {args.lora_rank}...")
model = prepare_model_for_kbit_training(model)
# model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, LoraConfig(
peft_config = LoraConfig(
r=args.lora_rank,
target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
target_modules="all-linear",
lora_alpha=16,
lora_dropout=0,
task_type=TaskType.CAUSAL_LM
))
)
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
@ -104,7 +105,7 @@ def main(args):
"seed": 42,
"dataset_text_field": "text",
"packing": False,
"max_seq_length": args.seq_length,
"max_length": args.seq_length,
"torch_compile": False,
"report_to": "none",
"logging_dir": args.log_dir,
@ -136,6 +137,7 @@ def main(args):
processing_class=tokenizer,
train_dataset=dataset,
args=SFTConfig(**config),
peft_config=peft_config,
)
trainer_stats = trainer.train()
@ -225,4 +227,4 @@ if __name__ == "__main__":
print(f"Torch compile: {args.use_torch_compile}")
print(f"{'='*60}\n")
main(args)
main(args)

View File

@ -56,12 +56,12 @@ def main(args):
tokenizer.pad_token = tokenizer.eos_token
# Configure LoRA config
model = get_peft_model(model, LoraConfig(
peft_config = LoraConfig(
r=args.lora_rank,
target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
lora_alpha=16,
lora_dropout=0,
task_type=TaskType.CAUSAL_LM))
task_type=TaskType.CAUSAL_LM)
print(f"Trainable parameters = {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")
# Load and preprocess the dataset
@ -80,21 +80,26 @@ def main(args):
"seed": 42,
"dataset_text_field": "text",
"packing": False,
"max_seq_length": args.seq_length,
"max_length": args.seq_length,
"torch_compile": False,
"report_to": "none",
"logging_dir": args.log_dir,
"logging_steps": args.logging_steps
}
# Warmup for torch compile
model = torch.compile(model)
SFTTrainer(
model=model,
processing_class=tokenizer,
train_dataset=dataset,
args=SFTConfig(**config),
).train()
# Compile model if requested
if args.use_torch_compile:
print("Compiling model with torch.compile()...")
model = torch.compile(model)
# Warmup for torch compile
print("Running warmup for torch.compile()...")
SFTTrainer(
model=model,
processing_class=tokenizer,
train_dataset=dataset,
args=SFTConfig(**config),
).train()
# Train the model
print(f"\nStarting LoRA fine-tuning for {args.num_epochs} epoch(s)...")
@ -105,6 +110,7 @@ def main(args):
processing_class=tokenizer,
train_dataset=dataset,
args=SFTConfig(**config),
peft_config=peft_config,
)
trainer_stats = trainer.train()
@ -155,6 +161,9 @@ def parse_arguments():
help="Log every N steps")
parser.add_argument("--log_dir", type=str, default="logs",
help="Directory for logs")
# Compilation
parser.add_argument("--use_torch_compile", action="store_true",
help="Use torch.compile() for faster training")
return parser.parse_args()
@ -171,6 +180,7 @@ if __name__ == "__main__":
print(f"Learning rate: {args.learning_rate}")
print(f"LoRA rank: {args.lora_rank}")
print(f"Dataset size: {args.dataset_size}")
print(f"Torch compile: {args.use_torch_compile}")
print(f"{'='*60}\n")
main(args)
main(args)

View File

@ -0,0 +1,32 @@
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: FSDP
downcast_bf16: 'no'
enable_cpu_affinity: false
fsdp_config:
fsdp_activation_checkpointing: false
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
fsdp_cpu_ram_efficient_loading: true
fsdp_offload_params: false
fsdp_reshard_after_forward: false
fsdp_state_dict_type: FULL_STATE_DICT
fsdp_transformer_layer_cls_to_wrap: 'LlamaDecoderLayer'
fsdp_version: 2
machine_rank: 0
main_process_ip: < TODO: specify IP >
main_process_port: < TODO: specify port >
main_training_function: main
mixed_precision: 'bf16'
num_machines: 2
num_processes: 2
parallelism_config:
parallelism_config_cp_size: 1
parallelism_config_dp_replicate_size: 1
parallelism_config_dp_shard_size: 2
parallelism_config_tp_size: 1
rdzv_backend: c10d
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
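This config describes two machines with one process each (num_machines: 2, num_processes: 2). A sketch of how each node might launch against it, assuming the IP/port placeholders above are filled in or overridden on the command line; the hostname, config path, and script name below are placeholders:

# On the node designated machine_rank 0:
accelerate launch --config_file=<path-to-this-config>.yaml --machine_rank=0 \
    --main_process_ip=<rank0-host> --main_process_port=29500 <training_script>.py

# On the second node, only machine_rank changes:
accelerate launch --config_file=<path-to-this-config>.yaml --machine_rank=1 \
    --main_process_ip=<rank0-host> --main_process_port=29500 <training_script>.py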

View File

@ -0,0 +1,28 @@
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: FSDP
downcast_bf16: 'no'
enable_cpu_affinity: false
fsdp_config:
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
fsdp_backward_prefetch: BACKWARD_PRE
fsdp_cpu_ram_efficient_loading: true
fsdp_forward_prefetch: false
fsdp_offload_params: false
fsdp_sharding_strategy: FULL_SHARD
fsdp_state_dict_type: SHARDED_STATE_DICT
fsdp_sync_module_states: true
fsdp_use_orig_params: true
machine_rank: 0
main_process_ip: < TODO: specify IP >
main_process_port: < TODO: specify port >
main_training_function: main
mixed_precision: 'bf16'
num_machines: 2
num_processes: 2
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false

View File

@ -0,0 +1,48 @@
version: '3.8'
services:
finetuning:
image: nvcr.io/nvidia/pytorch:25.10-py3
deploy:
replicas: 2
restart_policy:
condition: any
delay: 5s
max_attempts: 3
window: 120s
resources:
reservations:
generic_resources:
- discrete_resource_spec:
kind: 'NVIDIA_GPU'
value: 1
environment:
- UCX_NET_DEVICES=enp1s0f1np1
- NCCL_SOCKET_IFNAME=enp1s0f1np1
- NCCL_DEBUG=INFO
- TORCH_NCCL_ASYNC_ERROR_HANDLING=1
- TORCH_DISTRIBUTED_DEBUG=DETAIL
- GLOO_SOCKET_IFNAME=enp1s0f1np1
- CUDA_DEVICE_MAX_CONNECTIONS=1
- CUDA_VISIBLE_DEVICES=0
entrypoint: /opt/pytorch-ft-entrypoint.sh
volumes:
- ${PWD}:/workspace
- ${PWD}/pytorch-ft-entrypoint.sh:/opt/pytorch-ft-entrypoint.sh
- ~/.cache/huggingface/:/root/.cache/huggingface/
- ~/.ssh:/tmp/.ssh:ro
ulimits:
memlock: -1
stack: 67108864
networks:
- host
healthcheck:
test: ["CMD", "service", "ssh", "status"]
interval: 30s
timeout: 10s
retries: 10
networks:
host:
name: host
external: true
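The deploy section (replicas, restart_policy, generic_resources) follows Swarm conventions, so this file appears intended for docker stack deploy rather than plain docker compose up. A minimal sketch, assuming the two hosts already form a swarm; the file name and stack name here are placeholders:

docker stack deploy -c docker-compose.yaml finetune   # stack name is illustrative
docker stack ps finetune                              # both replicas should reach the Running state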

View File

@ -0,0 +1,18 @@
#!/bin/bash
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
pip install "transformers>=4.57.1" peft datasets "trl>=0.25.1" "bitsandbytes>=0.48.2" "torchao==0.13.0"

View File

@ -0,0 +1,62 @@
#!/usr/bin/env bash
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
set -e
SSH_PORT="${SSH_PORT:-2233}"
# Install and configure OpenSSH server
apt-get update && \
apt-get install -y openssh-server && \
mkdir -p /var/run/sshd
ls -lha /tmp/.ssh
cp -R /tmp/.ssh /root/
ls -lha /root/.ssh
chown -R root:root /root/.ssh
chmod 700 /root/.ssh
chmod 600 /root/.ssh/*
if compgen -G "/root/.ssh/*.pub" > /dev/null; then
chmod 644 /root/.ssh/*.pub
fi
# Allow root login and key-based auth, move port to 2233
sed -i.bak \
-e 's/^#\?\s*PermitRootLogin\s.*/PermitRootLogin yes/' \
-e 's/^#\?\s*PubkeyAuthentication\s.*/PubkeyAuthentication yes/' \
-e 's/^#\?\s*Port\s\+22\s*$/Port '$SSH_PORT'/' \
/etc/ssh/sshd_config
# Set root password
echo "root:root" | chpasswd
# Configure SSH client for root to disable host key checks within *
echo -e '\nHost *\n StrictHostKeyChecking no\n Port '$SSH_PORT'\n UserKnownHostsFile=/dev/null' > /etc/ssh/ssh_config.d/pyt-ft.conf && \
chmod 600 /etc/ssh/ssh_config.d/pyt-ft.conf
# Fix login session for container
sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
# Start SSHD
echo "Starting SSH"
/usr/sbin/sshd -D || {
    sshd_rc=$?
    echo "Failed to start SSHD, rc $sshd_rc"
    exit $sshd_rc
}
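Once both containers are running this entrypoint, the SSH mesh can be sanity-checked from inside either container; the peer address below is a placeholder, and the client config written above already pins the port and disables host-key prompts:

ssh root@<peer-node-address> hostname   # should print the peer's hostname without prompting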

View File

@ -0,0 +1,22 @@
#!/bin/bash
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
docker exec \
-e HF_TOKEN=$HF_TOKEN \
-it $FINETUNING_CONTAINER bash -c '
bash /workspace/install-requirements;
accelerate launch --config_file=/workspace/configs/config_finetuning.yaml /workspace/Llama3_3B_full_finetuning.py'
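A sketch of the environment this wrapper (and the two variants below) expects before it runs on each node; the container-name filter assumes the compose service defined earlier, and both the token value and the wrapper's file name are placeholders:

export HF_TOKEN=hf_xxxxxxxxxxxxxxxx                              # placeholder Hugging Face token
export FINETUNING_CONTAINER=$(docker ps -qf "name=finetuning")   # assumes the compose service name above
bash ./launch-full-finetuning.sh                                 # hypothetical name for this wrapper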

View File

@ -0,0 +1,22 @@
#!/bin/bash
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
docker exec \
-e HF_TOKEN=$HF_TOKEN \
-it $FINETUNING_CONTAINER bash -c '
bash /workspace/install-requirements;
accelerate launch --config_file=/workspace/configs/config_fsdp_lora.yaml /workspace/Llama3_70B_LoRA_finetuning.py'

View File

@ -0,0 +1,22 @@
#!/bin/bash
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
docker exec \
-e HF_TOKEN=$HF_TOKEN \
-it $FINETUNING_CONTAINER bash -c '
bash /workspace/install-requirements;
accelerate launch --config_file=/workspace/configs/config_fsdp_lora.yaml /workspace/Llama3_8B_LoRA_finetuning.py'