diff --git a/nvidia/pytorch-fine-tune/assets/Llama3_3B_full_finetuning.py b/nvidia/pytorch-fine-tune/assets/Llama3_3B_full_finetuning.py index f829a1b..b40a5ca 100644 --- a/nvidia/pytorch-fine-tune/assets/Llama3_3B_full_finetuning.py +++ b/nvidia/pytorch-fine-tune/assets/Llama3_3B_full_finetuning.py @@ -22,6 +22,7 @@ from trl import SFTConfig, SFTTrainer from transformers import AutoModelForCausalLM, AutoTokenizer + # Define prompt templates ALPACA_PROMPT_TEMPLATE = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. ### Instruction: {} @@ -77,7 +78,7 @@ def main(args): "seed": 42, "dataset_text_field": "text", "packing": False, - "max_seq_length": args.seq_length, + "max_length": args.seq_length, "torch_compile": False, "report_to": "none", "logging_dir": args.log_dir, @@ -192,4 +193,4 @@ if __name__ == "__main__": print(f"Torch compile: {args.use_torch_compile}") print(f"{'='*60}\n") - main(args) \ No newline at end of file + main(args) diff --git a/nvidia/pytorch-fine-tune/assets/Llama3_70B_LoRA_finetuning.py b/nvidia/pytorch-fine-tune/assets/Llama3_70B_LoRA_finetuning.py new file mode 100644 index 0000000..b8731df --- /dev/null +++ b/nvidia/pytorch-fine-tune/assets/Llama3_70B_LoRA_finetuning.py @@ -0,0 +1,223 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import torch +import argparse +import os +from datasets import load_dataset +from trl import SFTConfig, SFTTrainer +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig +from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training + + +# Define prompt templates +ALPACA_PROMPT_TEMPLATE = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. +### Instruction: {} + +### Input: {} + +### Response: {}""" + +def get_alpaca_dataset(eos_token, dataset_size=500): + # Preprocess the dataset + def preprocess(x): + texts = [ + ALPACA_PROMPT_TEMPLATE.format(instruction, input, output) + eos_token + for instruction, input, output in zip(x["instruction"], x["input"], x["output"]) + ] + return {"text": texts} + + dataset = load_dataset("tatsu-lab/alpaca", split="train").select(range(dataset_size)).shuffle(seed=42) + return dataset.map(preprocess, remove_columns=dataset.column_names, batched=True) + + +def main(args): + # Load the model and tokenizer + print(f"Loading model: {args.model_name}") + print(f"Training mode: LoRA") + + + # When using FSDP, don't use device_map to avoid loading full model on one device + # FSDP will handle sharding and device placement + # With fsdp_cpu_ram_efficient_loading=true in config, model loads on meta device first, + # then FSDP shards and materializes it across all devices/nodes + # This prevents OOM when a single node can't hold the full model + + model = AutoModelForCausalLM.from_pretrained( + args.model_name, + dtype=args.dtype, + low_cpu_mem_usage=True, # Use lazy loading to reduce memory usage + trust_remote_code=True + ) + tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True) + tokenizer.pad_token = tokenizer.eos_token + + # Prepare model for 
LoRA training + print(f"Preparing model for LoRA with rank {args.lora_rank}...") + + peft_config = LoraConfig( + r=args.lora_rank, + target_modules=["q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj"], + lora_alpha=16, + lora_dropout=0, + task_type=TaskType.CAUSAL_LM + ) + + trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + total_params = sum(p.numel() for p in model.parameters()) + print(f"Trainable parameters: {trainable_params:,} ({100 * trainable_params / total_params:.2f}%)") + + # Load and preprocess the dataset + print(f"Loading dataset with {args.dataset_size} samples...") + dataset = get_alpaca_dataset(tokenizer.eos_token, args.dataset_size) + + # Configure the SFT config + config = { + "per_device_train_batch_size": args.batch_size, + "num_train_epochs": 0.01, # Warmup epoch + "gradient_accumulation_steps": args.gradient_accumulation_steps, + "learning_rate": args.learning_rate, + "optim": "adamw_torch", + "save_strategy": 'no', + "remove_unused_columns": False, + "seed": 42, + "dataset_text_field": "text", + "packing": False, + "max_length": args.seq_length, + "torch_compile": False, + "report_to": "none", + "logging_dir": args.log_dir, + "logging_steps": args.logging_steps, + "gradient_checkpointing": args.gradient_checkpointing + } + + # Compile model if requested + if args.use_torch_compile: + print("Compiling model with torch.compile()...") + model = torch.compile(model) + + # Warmup for torch compile + print("Running warmup for torch.compile()...") + SFTTrainer( + model=model, + processing_class=tokenizer, + train_dataset=dataset, + args=SFTConfig(**config), + ).train() + + # Train the model + print(f"\nStarting LoRA fine-tuning for {args.num_epochs} epoch(s)...") + config["num_train_epochs"] = args.num_epochs + config["report_to"] = "tensorboard" + + trainer = SFTTrainer( + model=model, + processing_class=tokenizer, + train_dataset=dataset, + args=SFTConfig(**config), + 
peft_config=peft_config, + ) + + trainer_stats = trainer.train() + + # Print training statistics + print(f"\n{'='*60}") + print("TRAINING COMPLETED") + print(f"{'='*60}") + print(f"Training runtime: {trainer_stats.metrics['train_runtime']:.2f} seconds") + print(f"Samples per second: {trainer_stats.metrics['train_samples_per_second']:.2f}") + print(f"Steps per second: {trainer_stats.metrics['train_steps_per_second']:.2f}") + print(f"Train loss: {trainer_stats.metrics['train_loss']:.4f}") + print(f"{'='*60}\n") + + # Save model if requested + if args.output_dir: + print(f"Saving model to {args.output_dir}...") + trainer.save_model(args.output_dir) + tokenizer.save_pretrained(args.output_dir) + print("Model saved successfully!") + + +def parse_arguments(): + parser = argparse.ArgumentParser(description="Llama 3.1 70B Fine-tuning with LoRA") + + # Model configuration + parser.add_argument("--model_name", type=str, default="meta-llama/Llama-3.1-70B-Instruct", + help="Model name or path") + parser.add_argument("--dtype", type=str, default="bfloat16", + help="Model dtype (e.g., float32, float16, bfloat16)") + + # Training configuration + parser.add_argument("--batch_size", type=int, default=4, + choices=[1, 2, 4, 8, 16, 32], + help="Per device training batch size") + parser.add_argument("--seq_length", type=int, default=2048, + choices=[256, 512, 1024, 2048, 4096, 8192], + help="Maximum sequence length") + parser.add_argument("--num_epochs", type=int, default=1, + help="Number of training epochs") + parser.add_argument("--gradient_accumulation_steps", type=int, default=1, + help="Gradient accumulation steps") + parser.add_argument("--learning_rate", type=float, default=1e-4, + help="Learning rate") + parser.add_argument("--gradient_checkpointing", action=argparse.BooleanOptionalAction, default=True, + help="Enable gradient checkpointing to save memory (default: enabled)") + + # LoRA configuration + parser.add_argument("--lora_rank", type=int, default=8, + help="LoRA 
rank") + + # Dataset configuration + parser.add_argument("--dataset_size", type=int, default=500, + help="Number of samples to use from dataset") + + # Logging configuration + parser.add_argument("--logging_steps", type=int, default=1, + help="Log every N steps") + parser.add_argument("--log_dir", type=str, default="logs", + help="Directory for logs") + + # Compilation and saving + parser.add_argument("--use_torch_compile", action="store_true", + help="Use torch.compile() for faster training") + parser.add_argument("--output_dir", type=str, default=None, + help="Directory to save the fine-tuned model") + + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_arguments() + print(f"\n{'='*60}") + print("LLAMA 3.1 70B LoRA FINE-TUNING") + print(f"{'='*60}") + print(f"Model: {args.model_name}") + print(f"Training mode: LoRA") + print(f"Batch size: {args.batch_size}") + print(f"Gradient accumulation: {args.gradient_accumulation_steps}") + print(f"Effective batch size: {args.batch_size * args.gradient_accumulation_steps}") + print(f"Sequence length: {args.seq_length}") + print(f"Number of epochs: {args.num_epochs}") + print(f"Learning rate: {args.learning_rate}") + print(f"LoRA rank: {args.lora_rank}") + print(f"Dataset size: {args.dataset_size}") + print(f"Gradient checkpointing: {args.gradient_checkpointing}") + print(f"Torch compile: {args.use_torch_compile}") + print(f"{'='*60}\n") + + main(args) diff --git a/nvidia/pytorch-fine-tune/assets/Llama3_70B_qLoRA_finetuning.py b/nvidia/pytorch-fine-tune/assets/Llama3_70B_qLoRA_finetuning.py index c633636..f6e001b 100644 --- a/nvidia/pytorch-fine-tune/assets/Llama3_70B_qLoRA_finetuning.py +++ b/nvidia/pytorch-fine-tune/assets/Llama3_70B_qLoRA_finetuning.py @@ -59,7 +59,8 @@ def main(args): load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type='nf4', - bnb_4bit_compute_dtype=getattr(torch, args.dtype), + bnb_4bit_compute_dtype=args.dtype, + bnb_4bit_quant_storage=args.dtype ) model = 
AutoModelForCausalLM.from_pretrained( @@ -74,15 +75,15 @@ def main(args): # Prepare model for QLoRA training print(f"Preparing model for QLoRA (4-bit) with rank {args.lora_rank}...") - model = prepare_model_for_kbit_training(model) + # model = prepare_model_for_kbit_training(model) - model = get_peft_model(model, LoraConfig( + peft_config = LoraConfig( r=args.lora_rank, - target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], + target_modules="all-linear", lora_alpha=16, lora_dropout=0, task_type=TaskType.CAUSAL_LM - )) + ) trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) total_params = sum(p.numel() for p in model.parameters()) @@ -104,7 +105,7 @@ def main(args): "seed": 42, "dataset_text_field": "text", "packing": False, - "max_seq_length": args.seq_length, + "max_length": args.seq_length, "torch_compile": False, "report_to": "none", "logging_dir": args.log_dir, @@ -136,6 +137,7 @@ def main(args): processing_class=tokenizer, train_dataset=dataset, args=SFTConfig(**config), + peft_config=peft_config, ) trainer_stats = trainer.train() @@ -225,4 +227,4 @@ if __name__ == "__main__": print(f"Torch compile: {args.use_torch_compile}") print(f"{'='*60}\n") - main(args) \ No newline at end of file + main(args) diff --git a/nvidia/pytorch-fine-tune/assets/Llama3_8B_LoRA_finetuning.py b/nvidia/pytorch-fine-tune/assets/Llama3_8B_LoRA_finetuning.py index 34b9e15..a0e68ed 100644 --- a/nvidia/pytorch-fine-tune/assets/Llama3_8B_LoRA_finetuning.py +++ b/nvidia/pytorch-fine-tune/assets/Llama3_8B_LoRA_finetuning.py @@ -56,12 +56,12 @@ def main(args): tokenizer.pad_token = tokenizer.eos_token # Configure LoRA config - model = get_peft_model(model, LoraConfig( + peft_config = LoraConfig( r=args.lora_rank, target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], lora_alpha=16, lora_dropout=0, - task_type=TaskType.CAUSAL_LM)) + task_type=TaskType.CAUSAL_LM) print(f"Trainable 
parameters = {sum(p.numel() for p in model.parameters() if p.requires_grad):,}") # Load and preprocess the dataset @@ -80,21 +80,26 @@ def main(args): "seed": 42, "dataset_text_field": "text", "packing": False, - "max_seq_length": args.seq_length, + "max_length": args.seq_length, "torch_compile": False, "report_to": "none", "logging_dir": args.log_dir, "logging_steps": args.logging_steps } - # Warmup for torch compile - model = torch.compile(model) - SFTTrainer( - model=model, - processing_class=tokenizer, - train_dataset=dataset, - args=SFTConfig(**config), - ).train() + # Compile model if requested + if args.use_torch_compile: + print("Compiling model with torch.compile()...") + model = torch.compile(model) + + # Warmup for torch compile + print("Running warmup for torch.compile()...") + SFTTrainer( + model=model, + processing_class=tokenizer, + train_dataset=dataset, + args=SFTConfig(**config), + ).train() # Train the model print(f"\nStarting LoRA fine-tuning for {args.num_epochs} epoch(s)...") @@ -105,6 +110,7 @@ def main(args): processing_class=tokenizer, train_dataset=dataset, args=SFTConfig(**config), + peft_config=peft_config, ) trainer_stats = trainer.train() @@ -155,6 +161,9 @@ def parse_arguments(): help="Log every N steps") parser.add_argument("--log_dir", type=str, default="logs", help="Directory for logs") + # Compilation + parser.add_argument("--use_torch_compile", action="store_true", + help="Use torch.compile() for faster training") return parser.parse_args() @@ -171,6 +180,7 @@ if __name__ == "__main__": print(f"Learning rate: {args.learning_rate}") print(f"LoRA rank: {args.lora_rank}") print(f"Dataset size: {args.dataset_size}") + print(f"Torch compile: {args.use_torch_compile}") print(f"{'='*60}\n") - main(args) \ No newline at end of file + main(args) diff --git a/nvidia/pytorch-fine-tune/assets/configs/config_finetuning.yaml b/nvidia/pytorch-fine-tune/assets/configs/config_finetuning.yaml new file mode 100644 index 0000000..29e2960 --- 
/dev/null +++ b/nvidia/pytorch-fine-tune/assets/configs/config_finetuning.yaml @@ -0,0 +1,32 @@ +compute_environment: LOCAL_MACHINE +debug: false +distributed_type: FSDP +downcast_bf16: 'no' +enable_cpu_affinity: false +fsdp_config: + fsdp_activation_checkpointing: false + fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP + fsdp_cpu_ram_efficient_loading: true + fsdp_offload_params: false + fsdp_reshard_after_forward: false + fsdp_state_dict_type: FULL_STATE_DICT + fsdp_transformer_layer_cls_to_wrap: 'LlamaDecoderLayer' + fsdp_version: 2 +machine_rank: 0 +main_process_ip: < TODO: specify IP > +main_process_port: < TODO: specify port > +main_training_function: main +mixed_precision: 'bf16' +num_machines: 2 +num_processes: 2 +parallelism_config: + parallelism_config_cp_size: 1 + parallelism_config_dp_replicate_size: 1 + parallelism_config_dp_shard_size: 2 + parallelism_config_tp_size: 1 +rdzv_backend: c10d +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false diff --git a/nvidia/pytorch-fine-tune/assets/configs/config_fsdp_lora.yaml b/nvidia/pytorch-fine-tune/assets/configs/config_fsdp_lora.yaml new file mode 100644 index 0000000..768c31d --- /dev/null +++ b/nvidia/pytorch-fine-tune/assets/configs/config_fsdp_lora.yaml @@ -0,0 +1,28 @@ +compute_environment: LOCAL_MACHINE +debug: false +distributed_type: FSDP +downcast_bf16: 'no' +enable_cpu_affinity: false +fsdp_config: + fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP + fsdp_backward_prefetch: BACKWARD_PRE + fsdp_cpu_ram_efficient_loading: true + fsdp_forward_prefetch: false + fsdp_offload_params: false + fsdp_sharding_strategy: FULL_SHARD + fsdp_state_dict_type: SHARDED_STATE_DICT + fsdp_sync_module_states: true + fsdp_use_orig_params: true +machine_rank: 0 +main_process_ip: < TODO: specify IP > +main_process_port: < TODO: specify port > +main_training_function: main +mixed_precision: 'bf16' +num_machines: 2 +num_processes: 2 +rdzv_backend: static +same_network: true +tpu_env: [] 
+tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false diff --git a/nvidia/pytorch-fine-tune/assets/docker-compose.yml b/nvidia/pytorch-fine-tune/assets/docker-compose.yml new file mode 100644 index 0000000..dcdf116 --- /dev/null +++ b/nvidia/pytorch-fine-tune/assets/docker-compose.yml @@ -0,0 +1,48 @@ +version: '3.8' + +services: + finetuning: + image: nvcr.io/nvidia/pytorch:25.10-py3 + deploy: + replicas: 2 + restart_policy: + condition: any + delay: 5s + max_attempts: 3 + window: 120s + resources: + reservations: + generic_resources: + - discrete_resource_spec: + kind: 'NVIDIA_GPU' + value: 1 + environment: + - UCX_NET_DEVICES=enp1s0f1np1 + - NCCL_SOCKET_IFNAME=enp1s0f1np1 + - NCCL_DEBUG=INFO + - TORCH_NCCL_ASYNC_ERROR_HANDLING=1 + - TORCH_DISTRIBUTED_DEBUG=DETAIL + - GLOO_SOCKET_IFNAME=enp1s0f1np1 + - CUDA_DEVICE_MAX_CONNECTIONS=1 + - CUDA_VISIBLE_DEVICES=0 + entrypoint: /opt/pytorch-ft-entrypoint.sh + volumes: + - ${PWD}:/workspace + - ${PWD}/pytorch-ft-entrypoint.sh:/opt/pytorch-ft-entrypoint.sh + - ~/.cache/huggingface/:/root/.cache/huggingface/ + - ~/.ssh:/tmp/.ssh:ro + ulimits: + memlock: -1 + stack: 67108864 + networks: + - host + healthcheck: + test: ["CMD", "service", "ssh", "status"] + interval: 30s + timeout: 10s + retries: 10 + +networks: + host: + name: host + external: true diff --git a/nvidia/pytorch-fine-tune/assets/install-requirements b/nvidia/pytorch-fine-tune/assets/install-requirements new file mode 100755 index 0000000..1c5a4c8 --- /dev/null +++ b/nvidia/pytorch-fine-tune/assets/install-requirements @@ -0,0 +1,18 @@ +#!/bin/bash +# +# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +pip install "transformers>=4.57.1" peft datasets "trl>=0.25.1" "bitsandbytes>=0.48.2" "torchao==0.13.0" diff --git a/nvidia/pytorch-fine-tune/assets/pytorch-ft-entrypoint.sh b/nvidia/pytorch-fine-tune/assets/pytorch-ft-entrypoint.sh new file mode 100755 index 0000000..ada3266 --- /dev/null +++ b/nvidia/pytorch-fine-tune/assets/pytorch-ft-entrypoint.sh @@ -0,0 +1,62 @@ +#!/usr/bin/env bash +# +# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +set -e + +SSH_PORT="${SSH_PORT:-2233}" + +# Install and configure OpenSSH server +apt-get update && \ + apt-get install -y openssh-server && \ + mkdir -p /var/run/sshd + +ls -lha /tmp/.ssh +cp -R /tmp/.ssh /root/ +ls -lha /root/.ssh +chown -R $USER: /root/.ssh +chmod 700 /root/.ssh +chmod 600 /root/.ssh/* +if compgen -G "/root/.ssh/*.pub" > /dev/null; then + chmod 644 /root/.ssh/*.pub +fi + + +# Allow root login and key-based auth, move port to 2233 +sed -i.bak \ + -e 's/^#\?\s*PermitRootLogin\s.*/PermitRootLogin yes/' \ + -e 's/^#\?\s*PubkeyAuthentication\s.*/PubkeyAuthentication yes/' \ + -e 's/^#\?\s*Port\s\+22\s*$/Port '$SSH_PORT'/' \ + /etc/ssh/sshd_config + +# Set root password +echo "root:root" | chpasswd + +# Configure SSH client for root to disable host key checks within * +echo -e '\nHost *\n StrictHostKeyChecking no\n Port '$SSH_PORT'\n UserKnownHostsFile=/dev/null' > /etc/ssh/ssh_config.d/pyt-ft.conf && \ + chmod 600 /etc/ssh/ssh_config.d/pyt-ft.conf + +# Fix login session for container +sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd + + +# Start SSHD +echo "Starting SSH" +exec /usr/sbin/sshd -D +sshd_rc=$? +echo "Failed to start SSHD, rc $sshd_rc" +exit $sshd_rc diff --git a/nvidia/pytorch-fine-tune/assets/run-multi-llama_3b b/nvidia/pytorch-fine-tune/assets/run-multi-llama_3b new file mode 100755 index 0000000..0763176 --- /dev/null +++ b/nvidia/pytorch-fine-tune/assets/run-multi-llama_3b @@ -0,0 +1,22 @@ +#!/bin/bash +# +# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +docker exec \ + -e HF_TOKEN=$HF_TOKEN \ + -it $FINETUNING_CONTAINER bash -c ' + bash /workspace/install-requirements; + accelerate launch --config_file=/workspace/configs/config_finetuning.yaml /workspace/Llama3_3B_full_finetuning.py' \ No newline at end of file diff --git a/nvidia/pytorch-fine-tune/assets/run-multi-llama_70b b/nvidia/pytorch-fine-tune/assets/run-multi-llama_70b new file mode 100755 index 0000000..84ac384 --- /dev/null +++ b/nvidia/pytorch-fine-tune/assets/run-multi-llama_70b @@ -0,0 +1,22 @@ +#!/bin/bash +# +# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +docker exec \ + -e HF_TOKEN=$HF_TOKEN \ + -it $FINETUNING_CONTAINER bash -c ' + bash /workspace/install-requirements; + accelerate launch --config_file=/workspace/configs/config_fsdp_lora.yaml /workspace/Llama3_70B_LoRA_finetuning.py' \ No newline at end of file diff --git a/nvidia/pytorch-fine-tune/assets/run-multi-llama_8b b/nvidia/pytorch-fine-tune/assets/run-multi-llama_8b new file mode 100755 index 0000000..0555d2b --- /dev/null +++ b/nvidia/pytorch-fine-tune/assets/run-multi-llama_8b @@ -0,0 +1,22 @@ +#!/bin/bash +# +# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +docker exec \ + -e HF_TOKEN=$HF_TOKEN \ + -it $FINETUNING_CONTAINER bash -c ' + bash /workspace/install-requirements; + accelerate launch --config_file=/workspace/configs/config_fsdp_lora.yaml /workspace/Llama3_8B_LoRA_finetuning.py' \ No newline at end of file