diff --git a/nvidia/pytorch-fine-tune/README.md b/nvidia/pytorch-fine-tune/README.md index 60140ac..3a9e5f9 100644 --- a/nvidia/pytorch-fine-tune/README.md +++ b/nvidia/pytorch-fine-tune/README.md @@ -134,8 +134,8 @@ python Llama3_3B_full_finetuning.py ## LoRA fine-tuning on Llama 3.1 8B python Llama3_8B_LoRA_finetuning.py -## LoRA fine-tuning on Llama 3.1 70B -python Llama3_70B_LoRA_finetuning.py +## qLoRA fine-tuning on Llama 3.1 70B +python Llama3_70B_qLoRA_finetuning.py ``` #### Common Command-Line Arguments @@ -158,7 +158,7 @@ All scripts support the following command-line arguments for customization: - `--lora_rank`: LoRA rank - higher values = more trainable parameters (default: `8`) ##### Dataset Configuration -- `--dataset_size`: Number of samples to use from the Alpaca dataset (default: `500`) +- `--dataset_size`: Number of samples to use from the Alpaca dataset (default: `512`) ##### Logging Configuration - `--logging_steps`: Log metrics every N steps (default: `1`) @@ -167,13 +167,6 @@ All scripts support the following command-line arguments for customization: ##### Model Saving - `--output_dir`: Directory to save the fine-tuned model (default: `None` - model not saved) -##### Performance Optimization -- `--use_torch_compile`: Enable `torch.compile()` for faster training (flag) - -> [!WARNING] -> **Important:** The `--use_torch_compile` flag is **not compatible with QLoRA** (`Llama3_70B_qLoRA_finetuning.py`). -> Only use this flag with full fine-tuning and standard LoRA scripts. - #### Usage Examples ```bash python Llama3_8B_LoRA_finetuning.py \ diff --git a/nvidia/pytorch-fine-tune/assets/Llama3_3B_full_finetuning.py b/nvidia/pytorch-fine-tune/assets/Llama3_3B_full_finetuning.py index b40a5ca..af6494e 100644 --- a/nvidia/pytorch-fine-tune/assets/Llama3_3B_full_finetuning.py +++ b/nvidia/pytorch-fine-tune/assets/Llama3_3B_full_finetuning.py @@ -31,7 +31,7 @@ ALPACA_PROMPT_TEMPLATE = """Below is an instruction that describes a task, paire ### Response: {}""" -def get_alpaca_dataset(eos_token, dataset_size=500): +def get_alpaca_dataset(eos_token, dataset_size=512): # Preprocess the dataset def preprocess(x): texts = [ @@ -69,7 +69,7 @@ def main(args): # Configure the SFT config config = { "per_device_train_batch_size": args.batch_size, - "num_train_epochs": 0.01, # Warmup epoch + "num_train_epochs": 0.05, # Warmup epoch "gradient_accumulation_steps": args.gradient_accumulation_steps, "learning_rate": args.learning_rate, "optim": "adamw_torch", @@ -79,26 +79,24 @@ def main(args): "dataset_text_field": "text", "packing": False, "max_length": args.seq_length, - "torch_compile": False, "report_to": "none", "logging_dir": args.log_dir, "logging_steps": args.logging_steps, "gradient_checkpointing": args.gradient_checkpointing, # Save memory } - # Compile model if requested - if args.use_torch_compile: - print("Compiling model with torch.compile()...") - model = torch.compile(model) + # Compile model for faster training + print("Compiling model with torch.compile()...") + model = torch.compile(model) - # Warmup for torch compile - print("Running warmup for torch.compile()...") - SFTTrainer( - model=model, - processing_class=tokenizer, - train_dataset=dataset, - args=SFTConfig(**config), - ).train() + # Warmup for torch compile + print("Running warmup for torch.compile()...") + SFTTrainer( + model=model, + processing_class=tokenizer, + train_dataset=dataset, + args=SFTConfig(**config), + ).train() # Train the model print(f"\nStarting full fine-tuning for {args.num_epochs} epoch(s)...") @@ -124,13 +122,6 @@ def main(args): print(f"Train loss: {trainer_stats.metrics['train_loss']:.4f}") print(f"{'='*60}\n") - # Save model if requested - if args.output_dir: - print(f"Saving model to {args.output_dir}...") - trainer.save_model(args.output_dir) - tokenizer.save_pretrained(args.output_dir) - print("Model saved successfully!") - def parse_arguments(): parser = argparse.ArgumentParser(description="Llama 3.2 3B Full Fine-tuning (SFT)") @@ -157,7 +148,7 @@ def parse_arguments(): help="Enable gradient checkpointing to save memory") # Dataset configuration - parser.add_argument("--dataset_size", type=int, default=500, + parser.add_argument("--dataset_size", type=int, default=512, help="Number of samples to use from dataset") # Logging configuration @@ -166,12 +157,6 @@ def parse_arguments(): parser.add_argument("--log_dir", type=str, default="logs", help="Directory for logs") - # Compilation and saving - parser.add_argument("--use_torch_compile", action="store_true", - help="Use torch.compile() for faster training") - parser.add_argument("--output_dir", type=str, default=None, - help="Directory to save the fine-tuned model") - return parser.parse_args() @@ -190,7 +175,6 @@ if __name__ == "__main__": print(f"Learning rate: {args.learning_rate}") print(f"Dataset size: {args.dataset_size}") print(f"Gradient checkpointing: {args.gradient_checkpointing}") - print(f"Torch compile: {args.use_torch_compile}") print(f"{'='*60}\n") main(args) diff --git a/nvidia/pytorch-fine-tune/assets/Llama3_70B_qLoRA_finetuning.py b/nvidia/pytorch-fine-tune/assets/Llama3_70B_qLoRA_finetuning.py index f6e001b..f507f4e 100644 --- a/nvidia/pytorch-fine-tune/assets/Llama3_70B_qLoRA_finetuning.py +++ b/nvidia/pytorch-fine-tune/assets/Llama3_70B_qLoRA_finetuning.py @@ -32,7 +32,7 @@ ALPACA_PROMPT_TEMPLATE = """Below is an instruction that describes a task, paire ### Response: {}""" -def get_alpaca_dataset(eos_token, dataset_size=500): +def get_alpaca_dataset(eos_token, dataset_size=512): # Preprocess the dataset def preprocess(x): texts = [ @@ -67,15 +67,14 @@ def main(args): args.model_name, quantization_config=quantization_config, dtype=args.dtype, - device_map=device_map_config, - trust_remote_code=True + device_map="cuda", ) - tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(args.model_name) tokenizer.pad_token = tokenizer.eos_token # Prepare model for QLoRA training print(f"Preparing model for QLoRA (4-bit) with rank {args.lora_rank}...") - # model = prepare_model_for_kbit_training(model) + model = prepare_model_for_kbit_training(model) peft_config = LoraConfig( r=args.lora_rank, @@ -96,7 +95,7 @@ def main(args): # Configure the SFT config config = { "per_device_train_batch_size": args.batch_size, - "num_train_epochs": 0.01, # Warmup epoch + "num_train_epochs": args.num_epochs, "gradient_accumulation_steps": args.gradient_accumulation_steps, "learning_rate": args.learning_rate, "optim": "adamw_torch", @@ -106,30 +105,14 @@ def main(args): "dataset_text_field": "text", "packing": False, "max_length": args.seq_length, - "torch_compile": False, "report_to": "none", "logging_dir": args.log_dir, "logging_steps": args.logging_steps, "gradient_checkpointing": args.gradient_checkpointing } - # Compile model if requested - if args.use_torch_compile: - print("Compiling model with torch.compile()...") - model = torch.compile(model) - - # Warmup for torch compile - print("Running warmup for torch.compile()...") - SFTTrainer( - model=model, - processing_class=tokenizer, - train_dataset=dataset, - args=SFTConfig(**config), - ).train() - # Train the model print(f"\nStarting QLoRA fine-tuning for {args.num_epochs} epoch(s)...") - config["num_train_epochs"] = args.num_epochs config["report_to"] = "tensorboard" trainer = SFTTrainer( @@ -164,7 +147,7 @@ def parse_arguments(): parser = argparse.ArgumentParser(description="Llama 3.1 70B Fine-tuning with QLoRA") # Model configuration - parser.add_argument("--model_name", type=str, default="meta-llama/Llama-3.1-70B-Instruct", + parser.add_argument("--model_name", type=str, default="unsloth/Meta-Llama-3.1-70B-bnb-4bit", help="Model name or path") parser.add_argument("--dtype", type=str, default="bfloat16", help="Model dtype (e.g., float32, float16, bfloat16)") @@ -190,7 +173,7 @@ def parse_arguments(): help="LoRA rank") # Dataset configuration - parser.add_argument("--dataset_size", type=int, default=500, + parser.add_argument("--dataset_size", type=int, default=512, help="Number of samples to use from dataset") # Logging configuration @@ -199,12 +182,6 @@ def parse_arguments(): parser.add_argument("--log_dir", type=str, default="logs", help="Directory for logs") - # Compilation and saving - parser.add_argument("--use_torch_compile", action="store_true", - help="Use torch.compile() for faster training") - parser.add_argument("--output_dir", type=str, default=None, - help="Directory to save the fine-tuned model") - return parser.parse_args() @@ -224,7 +201,6 @@ if __name__ == "__main__": print(f"LoRA rank: {args.lora_rank}") print(f"Dataset size: {args.dataset_size}") print(f"Gradient checkpointing: {args.gradient_checkpointing}") - print(f"Torch compile: {args.use_torch_compile}") print(f"{'='*60}\n") main(args) diff --git a/nvidia/pytorch-fine-tune/assets/Llama3_8B_LoRA_finetuning.py b/nvidia/pytorch-fine-tune/assets/Llama3_8B_LoRA_finetuning.py index 4023b84..6114594 100644 --- a/nvidia/pytorch-fine-tune/assets/Llama3_8B_LoRA_finetuning.py +++ b/nvidia/pytorch-fine-tune/assets/Llama3_8B_LoRA_finetuning.py @@ -31,7 +31,7 @@ ALPACA_PROMPT_TEMPLATE = """Below is an instruction that describes a task, paire ### Response: {}""" -def get_alpaca_dataset(eos_token, dataset_size=500): +def get_alpaca_dataset(eos_token, dataset_size=512): # Preprocess the dataset def preprocess(x): texts = [ @@ -83,25 +83,23 @@ def main(args): "dataset_text_field": "text", "packing": False, "max_length": args.seq_length, - "torch_compile": False, "report_to": "none", "logging_dir": args.log_dir, "logging_steps": args.logging_steps } - # Compile model if requested - if args.use_torch_compile: - print("Compiling model with torch.compile()...") - model = torch.compile(model) - - # Warmup for torch compile - print("Running warmup for torch.compile()...") - SFTTrainer( - model=model, - processing_class=tokenizer, - train_dataset=dataset, - args=SFTConfig(**config), - ).train() + # Compile model for faster training + print("Compiling model with torch.compile()...") + model = torch.compile(model) + + # Warmup for torch compile + print("Running warmup for torch.compile()...") + SFTTrainer( + model=model, + processing_class=tokenizer, + train_dataset=dataset, + args=SFTConfig(**config), + ).train() # Train the model print(f"\nStarting LoRA fine-tuning for {args.num_epochs} epoch(s)...") @@ -138,7 +136,7 @@ def parse_arguments(): help="Model dtype") # Training configuration - parser.add_argument("--batch_size", type=int, default=4, + parser.add_argument("--batch_size", type=int, default=8, help="Per device training batch size") parser.add_argument("--seq_length", type=int, default=2048, help="Maximum sequence length") @@ -154,7 +152,7 @@ def parse_arguments(): help="LoRA rank") # Dataset configuration - parser.add_argument("--dataset_size", type=int, default=500, + parser.add_argument("--dataset_size", type=int, default=512, help="Number of samples to use from dataset") # Logging configuration @@ -162,9 +160,6 @@ def parse_arguments(): help="Log every N steps") parser.add_argument("--log_dir", type=str, default="logs", help="Directory for logs") - # Compilation - parser.add_argument("--use_torch_compile", action="store_true", - help="Use torch.compile() for faster training") return parser.parse_args() @@ -181,7 +176,6 @@ if __name__ == "__main__": print(f"Learning rate: {args.learning_rate}") print(f"LoRA rank: {args.lora_rank}") print(f"Dataset size: {args.dataset_size}") - print(f"Torch compile: {args.use_torch_compile}") print(f"{'='*60}\n") main(args) \ No newline at end of file