mirror of
https://github.com/NVIDIA/dgx-spark-playbooks.git
synced 2026-04-23 02:23:53 +00:00
chore: Regenerate all playbooks
This commit is contained in:
parent
8f8e2e6f5d
commit
a8a81dd1a1
@ -22,6 +22,7 @@ from trl import SFTConfig, SFTTrainer
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
|
||||
|
||||
# Define prompt templates
|
||||
ALPACA_PROMPT_TEMPLATE = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
|
||||
### Instruction: {}
|
||||
@ -77,7 +78,7 @@ def main(args):
|
||||
"seed": 42,
|
||||
"dataset_text_field": "text",
|
||||
"packing": False,
|
||||
"max_seq_length": args.seq_length,
|
||||
"max_length": args.seq_length,
|
||||
"torch_compile": False,
|
||||
"report_to": "none",
|
||||
"logging_dir": args.log_dir,
|
||||
@ -192,4 +193,4 @@ if __name__ == "__main__":
|
||||
print(f"Torch compile: {args.use_torch_compile}")
|
||||
print(f"{'='*60}\n")
|
||||
|
||||
main(args)
|
||||
main(args)
|
||||
|
||||
223
nvidia/pytorch-fine-tune/assets/Llama3_70B_LoRA_finetuning.py
Normal file
223
nvidia/pytorch-fine-tune/assets/Llama3_70B_LoRA_finetuning.py
Normal file
@ -0,0 +1,223 @@
|
||||
#
|
||||
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import torch
|
||||
import argparse
|
||||
import os
|
||||
from datasets import load_dataset
|
||||
from trl import SFTConfig, SFTTrainer
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
||||
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
|
||||
|
||||
|
||||
# Define prompt templates
|
||||
ALPACA_PROMPT_TEMPLATE = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
|
||||
### Instruction: {}
|
||||
|
||||
### Input: {}
|
||||
|
||||
### Response: {}"""
|
||||
|
||||
def get_alpaca_dataset(eos_token, dataset_size=500):
    """Load a shuffled slice of the tatsu-lab/alpaca dataset formatted for SFT.

    Each record is rendered through ``ALPACA_PROMPT_TEMPLATE`` (instruction,
    input, output) and terminated with *eos_token* so the trainer sees a
    complete sequence boundary.

    Args:
        eos_token: Tokenizer end-of-sequence string appended to every sample.
        dataset_size: Number of samples to take from the train split.

    Returns:
        A ``datasets.Dataset`` with a single ``"text"`` column.
    """
    # Batched preprocessor: maps raw Alpaca columns to one "text" string each.
    def preprocess(batch):
        # NOTE: renamed loop variables so the builtin ``input`` is not shadowed.
        texts = [
            ALPACA_PROMPT_TEMPLATE.format(instruction, input_text, output) + eos_token
            for instruction, input_text, output in zip(
                batch["instruction"], batch["input"], batch["output"]
            )
        ]
        return {"text": texts}

    # Select a fixed-size subset, then shuffle deterministically for reproducibility.
    dataset = load_dataset("tatsu-lab/alpaca", split="train").select(range(dataset_size)).shuffle(seed=42)
    # Drop original columns so only the rendered "text" field remains.
    return dataset.map(preprocess, remove_columns=dataset.column_names, batched=True)
|
||||
|
||||
|
||||
def main(args):
    """Run LoRA supervised fine-tuning of a causal LM on the Alpaca dataset.

    Loads the model and tokenizer named by ``args.model_name``, builds a
    LoRA config, optionally compiles the model and performs a short warmup
    run, trains with TRL's ``SFTTrainer``, prints timing/loss metrics, and
    saves the result to ``args.output_dir`` when given.

    Args:
        args: ``argparse.Namespace`` produced by ``parse_arguments()``.
    """
    # Load the model and tokenizer
    print(f"Loading model: {args.model_name}")
    print(f"Training mode: LoRA")

    # When using FSDP, don't use device_map to avoid loading full model on one device
    # FSDP will handle sharding and device placement
    # With fsdp_cpu_ram_efficient_loading=true in config, model loads on meta device first,
    # then FSDP shards and materializes it across all devices/nodes
    # This prevents OOM when a single node can't hold the full model
    model = AutoModelForCausalLM.from_pretrained(
        args.model_name,
        dtype=args.dtype,
        low_cpu_mem_usage=True,  # Use lazy loading to reduce memory usage
        trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True)
    # Reuse EOS as the pad token; Llama tokenizers ship without a pad token.
    tokenizer.pad_token = tokenizer.eos_token

    # Prepare model for LoRA training
    print(f"Preparing model for LoRA with rank {args.lora_rank}...")

    # LoRA adapters target all attention and MLP projection matrices.
    peft_config = LoraConfig(
        r=args.lora_rank,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
        lora_alpha=16,
        lora_dropout=0,
        task_type=TaskType.CAUSAL_LM
    )

    # NOTE(review): these counts are taken BEFORE the trainer applies
    # peft_config, so the model is still unwrapped here and this will report
    # ~100% trainable — move after trainer creation to see the LoRA count.
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Trainable parameters: {trainable_params:,} ({100 * trainable_params / total_params:.2f}%)")

    # Load and preprocess the dataset
    print(f"Loading dataset with {args.dataset_size} samples...")
    dataset = get_alpaca_dataset(tokenizer.eos_token, args.dataset_size)

    # Configure the SFT config
    # Built as a plain dict so the warmup run can reuse it and the real run
    # can mutate num_train_epochs / report_to below.
    config = {
        "per_device_train_batch_size": args.batch_size,
        "num_train_epochs": 0.01,  # Warmup epoch
        "gradient_accumulation_steps": args.gradient_accumulation_steps,
        "learning_rate": args.learning_rate,
        "optim": "adamw_torch",
        "save_strategy": 'no',
        "remove_unused_columns": False,
        "seed": 42,
        "dataset_text_field": "text",
        "packing": False,
        "max_length": args.seq_length,
        "torch_compile": False,
        "report_to": "none",
        "logging_dir": args.log_dir,
        "logging_steps": args.logging_steps,
        "gradient_checkpointing": args.gradient_checkpointing
    }

    # Compile model if requested
    if args.use_torch_compile:
        print("Compiling model with torch.compile()...")
        model = torch.compile(model)

        # Warmup for torch compile
        # Runs a tiny (0.01-epoch) training pass to trigger graph compilation.
        # NOTE(review): this warmup trainer is built WITHOUT peft_config, so
        # the short warmup run updates the full model — confirm intended.
        print("Running warmup for torch.compile()...")
        SFTTrainer(
            model=model,
            processing_class=tokenizer,
            train_dataset=dataset,
            args=SFTConfig(**config),
        ).train()

    # Train the model
    print(f"\nStarting LoRA fine-tuning for {args.num_epochs} epoch(s)...")
    config["num_train_epochs"] = args.num_epochs
    config["report_to"] = "tensorboard"

    trainer = SFTTrainer(
        model=model,
        processing_class=tokenizer,
        train_dataset=dataset,
        args=SFTConfig(**config),
        peft_config=peft_config,
    )

    trainer_stats = trainer.train()

    # Print training statistics
    print(f"\n{'='*60}")
    print("TRAINING COMPLETED")
    print(f"{'='*60}")
    print(f"Training runtime: {trainer_stats.metrics['train_runtime']:.2f} seconds")
    print(f"Samples per second: {trainer_stats.metrics['train_samples_per_second']:.2f}")
    print(f"Steps per second: {trainer_stats.metrics['train_steps_per_second']:.2f}")
    print(f"Train loss: {trainer_stats.metrics['train_loss']:.4f}")
    print(f"{'='*60}\n")

    # Save model if requested
    if args.output_dir:
        print(f"Saving model to {args.output_dir}...")
        trainer.save_model(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        print("Model saved successfully!")
|
||||
|
||||
|
||||
def parse_arguments():
    """Build and parse the command-line arguments for LoRA fine-tuning.

    Returns:
        ``argparse.Namespace`` with model, training, LoRA, dataset, logging,
        compilation, and output options.
    """
    p = argparse.ArgumentParser(description="Llama 3.1 70B Fine-tuning with LoRA")
    add = p.add_argument

    # Model configuration
    add("--model_name", type=str, default="meta-llama/Llama-3.1-70B-Instruct",
        help="Model name or path")
    add("--dtype", type=str, default="bfloat16",
        help="Model dtype (e.g., float32, float16, bfloat16)")

    # Training configuration
    add("--batch_size", type=int, default=4,
        choices=[1, 2, 4, 8, 16, 32],
        help="Per device training batch size")
    add("--seq_length", type=int, default=2048,
        choices=[256, 512, 1024, 2048, 4096, 8192],
        help="Maximum sequence length")
    add("--num_epochs", type=int, default=1,
        help="Number of training epochs")
    add("--gradient_accumulation_steps", type=int, default=1,
        help="Gradient accumulation steps")
    add("--learning_rate", type=float, default=1e-4,
        help="Learning rate")
    add("--gradient_checkpointing", action=argparse.BooleanOptionalAction, default=True,
        help="Enable gradient checkpointing to save memory (default: enabled)")

    # LoRA configuration
    add("--lora_rank", type=int, default=8,
        help="LoRA rank")

    # Dataset configuration
    add("--dataset_size", type=int, default=500,
        help="Number of samples to use from dataset")

    # Logging configuration
    add("--logging_steps", type=int, default=1,
        help="Log every N steps")
    add("--log_dir", type=str, default="logs",
        help="Directory for logs")

    # Compilation and saving
    add("--use_torch_compile", action="store_true",
        help="Use torch.compile() for faster training")
    add("--output_dir", type=str, default=None,
        help="Directory to save the fine-tuned model")

    return p.parse_args()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    args = parse_arguments()

    # Banner: render the run configuration table-driven so labels and values
    # stay aligned in one place; output is identical to individual prints.
    rule = "=" * 60
    settings = [
        ("Model", args.model_name),
        ("Training mode", "LoRA"),
        ("Batch size", args.batch_size),
        ("Gradient accumulation", args.gradient_accumulation_steps),
        ("Effective batch size", args.batch_size * args.gradient_accumulation_steps),
        ("Sequence length", args.seq_length),
        ("Number of epochs", args.num_epochs),
        ("Learning rate", args.learning_rate),
        ("LoRA rank", args.lora_rank),
        ("Dataset size", args.dataset_size),
        ("Gradient checkpointing", args.gradient_checkpointing),
        ("Torch compile", args.use_torch_compile),
    ]
    print(f"\n{rule}")
    print("LLAMA 3.1 70B LoRA FINE-TUNING")
    print(rule)
    for label, value in settings:
        print(f"{label}: {value}")
    print(f"{rule}\n")

    main(args)
|
||||
@ -59,7 +59,8 @@ def main(args):
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_use_double_quant=True,
|
||||
bnb_4bit_quant_type='nf4',
|
||||
bnb_4bit_compute_dtype=getattr(torch, args.dtype),
|
||||
bnb_4bit_compute_dtype=args.dtype,
|
||||
bnb_4bit_quant_storage=args.dtype
|
||||
)
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
@ -74,15 +75,15 @@ def main(args):
|
||||
|
||||
# Prepare model for QLoRA training
|
||||
print(f"Preparing model for QLoRA (4-bit) with rank {args.lora_rank}...")
|
||||
model = prepare_model_for_kbit_training(model)
|
||||
# model = prepare_model_for_kbit_training(model)
|
||||
|
||||
model = get_peft_model(model, LoraConfig(
|
||||
peft_config = LoraConfig(
|
||||
r=args.lora_rank,
|
||||
target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
|
||||
target_modules="all-linear",
|
||||
lora_alpha=16,
|
||||
lora_dropout=0,
|
||||
task_type=TaskType.CAUSAL_LM
|
||||
))
|
||||
)
|
||||
|
||||
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
|
||||
total_params = sum(p.numel() for p in model.parameters())
|
||||
@ -104,7 +105,7 @@ def main(args):
|
||||
"seed": 42,
|
||||
"dataset_text_field": "text",
|
||||
"packing": False,
|
||||
"max_seq_length": args.seq_length,
|
||||
"max_length": args.seq_length,
|
||||
"torch_compile": False,
|
||||
"report_to": "none",
|
||||
"logging_dir": args.log_dir,
|
||||
@ -136,6 +137,7 @@ def main(args):
|
||||
processing_class=tokenizer,
|
||||
train_dataset=dataset,
|
||||
args=SFTConfig(**config),
|
||||
peft_config=peft_config,
|
||||
)
|
||||
|
||||
trainer_stats = trainer.train()
|
||||
@ -225,4 +227,4 @@ if __name__ == "__main__":
|
||||
print(f"Torch compile: {args.use_torch_compile}")
|
||||
print(f"{'='*60}\n")
|
||||
|
||||
main(args)
|
||||
main(args)
|
||||
|
||||
@ -56,12 +56,12 @@ def main(args):
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
|
||||
# Configure LoRA config
|
||||
model = get_peft_model(model, LoraConfig(
|
||||
peft_config = LoraConfig(
|
||||
r=args.lora_rank,
|
||||
target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
|
||||
lora_alpha=16,
|
||||
lora_dropout=0,
|
||||
task_type=TaskType.CAUSAL_LM))
|
||||
task_type=TaskType.CAUSAL_LM)
|
||||
print(f"Trainable parameters = {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")
|
||||
|
||||
# Load and preprocess the dataset
|
||||
@ -80,21 +80,26 @@ def main(args):
|
||||
"seed": 42,
|
||||
"dataset_text_field": "text",
|
||||
"packing": False,
|
||||
"max_seq_length": args.seq_length,
|
||||
"max_length": args.seq_length,
|
||||
"torch_compile": False,
|
||||
"report_to": "none",
|
||||
"logging_dir": args.log_dir,
|
||||
"logging_steps": args.logging_steps
|
||||
}
|
||||
|
||||
# Warmup for torch compile
|
||||
model = torch.compile(model)
|
||||
SFTTrainer(
|
||||
model=model,
|
||||
processing_class=tokenizer,
|
||||
train_dataset=dataset,
|
||||
args=SFTConfig(**config),
|
||||
).train()
|
||||
# Compile model if requested
|
||||
if args.use_torch_compile:
|
||||
print("Compiling model with torch.compile()...")
|
||||
model = torch.compile(model)
|
||||
|
||||
# Warmup for torch compile
|
||||
print("Running warmup for torch.compile()...")
|
||||
SFTTrainer(
|
||||
model=model,
|
||||
processing_class=tokenizer,
|
||||
train_dataset=dataset,
|
||||
args=SFTConfig(**config),
|
||||
).train()
|
||||
|
||||
# Train the model
|
||||
print(f"\nStarting LoRA fine-tuning for {args.num_epochs} epoch(s)...")
|
||||
@ -105,6 +110,7 @@ def main(args):
|
||||
processing_class=tokenizer,
|
||||
train_dataset=dataset,
|
||||
args=SFTConfig(**config),
|
||||
peft_config=peft_config,
|
||||
)
|
||||
|
||||
trainer_stats = trainer.train()
|
||||
@ -155,6 +161,9 @@ def parse_arguments():
|
||||
help="Log every N steps")
|
||||
parser.add_argument("--log_dir", type=str, default="logs",
|
||||
help="Directory for logs")
|
||||
# Compilation
|
||||
parser.add_argument("--use_torch_compile", action="store_true",
|
||||
help="Use torch.compile() for faster training")
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
@ -171,6 +180,7 @@ if __name__ == "__main__":
|
||||
print(f"Learning rate: {args.learning_rate}")
|
||||
print(f"LoRA rank: {args.lora_rank}")
|
||||
print(f"Dataset size: {args.dataset_size}")
|
||||
print(f"Torch compile: {args.use_torch_compile}")
|
||||
print(f"{'='*60}\n")
|
||||
|
||||
main(args)
|
||||
main(args)
|
||||
|
||||
@ -0,0 +1,32 @@
|
||||
compute_environment: LOCAL_MACHINE
|
||||
debug: false
|
||||
distributed_type: FSDP
|
||||
downcast_bf16: 'no'
|
||||
enable_cpu_affinity: false
|
||||
fsdp_config:
|
||||
fsdp_activation_checkpointing: false
|
||||
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
fsdp_cpu_ram_efficient_loading: true
|
||||
fsdp_offload_params: false
|
||||
fsdp_reshard_after_forward: false
|
||||
fsdp_state_dict_type: FULL_STATE_DICT
|
||||
fsdp_transformer_layer_cls_to_wrap: 'LlamaDecoderLayer'
|
||||
fsdp_version: 2
|
||||
machine_rank: 0
|
||||
main_process_ip: < TODO: specify IP >
|
||||
main_process_port: < TODO: specify port >
|
||||
main_training_function: main
|
||||
mixed_precision: 'bf16'
|
||||
num_machines: 2
|
||||
num_processes: 2
|
||||
parallelism_config:
|
||||
parallelism_config_cp_size: 1
|
||||
parallelism_config_dp_replicate_size: 1
|
||||
parallelism_config_dp_shard_size: 2
|
||||
parallelism_config_tp_size: 1
|
||||
rdzv_backend: c10d
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
tpu_use_cluster: false
|
||||
tpu_use_sudo: false
|
||||
use_cpu: false
|
||||
@ -0,0 +1,28 @@
|
||||
compute_environment: LOCAL_MACHINE
|
||||
debug: false
|
||||
distributed_type: FSDP
|
||||
downcast_bf16: 'no'
|
||||
enable_cpu_affinity: false
|
||||
fsdp_config:
|
||||
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
fsdp_backward_prefetch: BACKWARD_PRE
|
||||
fsdp_cpu_ram_efficient_loading: true
|
||||
fsdp_forward_prefetch: false
|
||||
fsdp_offload_params: false
|
||||
fsdp_sharding_strategy: FULL_SHARD
|
||||
fsdp_state_dict_type: SHARDED_STATE_DICT
|
||||
fsdp_sync_module_states: true
|
||||
fsdp_use_orig_params: true
|
||||
machine_rank: 0
|
||||
main_process_ip: < TODO: specify IP >
|
||||
main_process_port: < TODO: specify port >
|
||||
main_training_function: main
|
||||
mixed_precision: 'bf16'
|
||||
num_machines: 2
|
||||
num_processes: 2
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
tpu_use_cluster: false
|
||||
tpu_use_sudo: false
|
||||
use_cpu: false
|
||||
48
nvidia/pytorch-fine-tune/assets/docker-compose.yml
Normal file
48
nvidia/pytorch-fine-tune/assets/docker-compose.yml
Normal file
@ -0,0 +1,48 @@
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
finetunine:
|
||||
image: nvcr.io/nvidia/pytorch:25.10-py3
|
||||
deploy:
|
||||
replicas: 2
|
||||
restart_policy:
|
||||
condition: any
|
||||
delay: 5s
|
||||
max_attempts: 3
|
||||
window: 120s
|
||||
resources:
|
||||
reservations:
|
||||
generic_resources:
|
||||
- discrete_resource_spec:
|
||||
kind: 'NVIDIA_GPU'
|
||||
value: 1
|
||||
environment:
|
||||
- UCX_NET_DEVICES=enp1s0f1np1
|
||||
- NCCL_SOCKET_IFNAME=enp1s0f1np1
|
||||
- NCCL_DEBUG=INFO
|
||||
- TORCH_NCCL_ASYNC_ERROR_HANDLING=1
|
||||
- TORCH_DISTRIBUTED_DEBUG=DETAIL
|
||||
- GLOO_SOCKET_IFNAME=enp1s0f1np1
|
||||
- CUDA_DEVICE_MAX_CONNECTIONS=1
|
||||
- CUDA_VISIBLE_DEVICES=0
|
||||
entrypoint: /opt/pytorch-ft-entrypoint.sh
|
||||
volumes:
|
||||
- ${PWD}:/workspace
|
||||
- ${PWD}/pytorch-ft-entrypoint.sh:/opt/pytorch-ft-entrypoint.sh
|
||||
- ~/.cache/huggingface/:/root/.cache/huggingface/
|
||||
- ~/.ssh:/tmp/.ssh:ro
|
||||
ulimits:
|
||||
memlock: -1
|
||||
stack: 67108864
|
||||
networks:
|
||||
- host
|
||||
healthcheck:
|
||||
test: ["CMD", "service", "ssh", "status"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 10
|
||||
|
||||
networks:
|
||||
host:
|
||||
name: host
|
||||
external: true
|
||||
18
nvidia/pytorch-fine-tune/assets/install-requirements
Executable file
18
nvidia/pytorch-fine-tune/assets/install-requirements
Executable file
@ -0,0 +1,18 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
pip install "transformers>=4.57.1" peft datasets "trl>=0.25.1" "bitsandbytes>=0.48.2" "torchao==0.13.0"
|
||||
62
nvidia/pytorch-fine-tune/assets/pytorch-ft-entrypoint.sh
Executable file
62
nvidia/pytorch-fine-tune/assets/pytorch-ft-entrypoint.sh
Executable file
@ -0,0 +1,62 @@
|
||||
#!/bin/env bash
|
||||
#
|
||||
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
set -e
|
||||
|
||||
SSH_PORT="${SSH_PORT:-2233}"
|
||||
|
||||
# Install and configure OpenSSH server
|
||||
apt-get update && \
|
||||
apt-get install -y openssh-server && \
|
||||
mkdir -p /var/run/sshd
|
||||
|
||||
ls -lha /tmp/.ssh
|
||||
cp -R /tmp/.ssh /root/
|
||||
ls -lha /root/.ssh
|
||||
chown -R $USER: /root/.ssh
|
||||
chmod 700 /root/.ssh
|
||||
chmod 600 /root/.ssh/*
|
||||
if compgen -G "/root/.ssh/*.pub" > /dev/null; then
|
||||
chmod 644 /root/.ssh/*.pub
|
||||
fi
|
||||
|
||||
|
||||
# Allow root login and key-based auth, move port to 2233
|
||||
sed -i.bak \
|
||||
-e 's/^#\?\s*PermitRootLogin\s.*/PermitRootLogin yes/' \
|
||||
-e 's/^#\?\s*PubkeyAuthentication\s.*/PubkeyAuthentication yes/' \
|
||||
-e 's/^#\?\s*Port\s\+22\s*$/Port '$SSH_PORT'/' \
|
||||
/etc/ssh/sshd_config
|
||||
|
||||
# Set root password
|
||||
echo "root:root" | chpasswd
|
||||
|
||||
# Configure SSH client for root to disable host key checks within *
|
||||
echo -e '\nHost *\n StrictHostKeyChecking no\n Port '$SSH_PORT'\n UserKnownHostsFile=/dev/null' > /etc/ssh/ssh_config.d/pyt-ft.conf && \
|
||||
chmod 600 /etc/ssh/ssh_config.d/pyt-ft.conf
|
||||
|
||||
# Fix login session for container
|
||||
sed 's@session\\s*required\\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
|
||||
|
||||
|
||||
# Start SSHD
|
||||
echo "Starting SSH"
|
||||
exec /usr/sbin/sshd -D
|
||||
sshd_rc = $?
|
||||
echo "Failed to start SSHD, rc $sshd_rc"
|
||||
exit $sshd_rc
|
||||
22
nvidia/pytorch-fine-tune/assets/run-multi-llama_3b
Executable file
22
nvidia/pytorch-fine-tune/assets/run-multi-llama_3b
Executable file
@ -0,0 +1,22 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
docker exec \
|
||||
-e HF_TOKEN=$HF_TOKEN \
|
||||
-it $FINETUNING_CONTAINER bash -c '
|
||||
bash /workspace/install-requirements;
|
||||
accelerate launch --config_file=/workspace/configs/config_finetuning.yaml /workspace/Llama3_3B_full_finetuning.py'
|
||||
22
nvidia/pytorch-fine-tune/assets/run-multi-llama_70b
Executable file
22
nvidia/pytorch-fine-tune/assets/run-multi-llama_70b
Executable file
@ -0,0 +1,22 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
docker exec \
|
||||
-e HF_TOKEN=$HF_TOKEN \
|
||||
-it $FINETUNING_CONTAINER bash -c '
|
||||
bash /workspace/install-requirements;
|
||||
accelerate launch --config_file=/workspace/configs/config_fsdp_lora.yaml /workspace/Llama3_70B_LoRA_finetuning.py'
|
||||
22
nvidia/pytorch-fine-tune/assets/run-multi-llama_8b
Executable file
22
nvidia/pytorch-fine-tune/assets/run-multi-llama_8b
Executable file
@ -0,0 +1,22 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
docker exec \
|
||||
-e HF_TOKEN=$HF_TOKEN \
|
||||
-it $FINETUNING_CONTAINER bash -c '
|
||||
bash /workspace/install-requirements;
|
||||
accelerate launch --config_file=/workspace/configs/config_fsdp_lora.yaml /workspace/Llama3_8B_LoRA_finetuning.py'
|
||||
Loading…
Reference in New Issue
Block a user