From 3a001a2e766340c18c116427cb46aab8bbb5410c Mon Sep 17 00:00:00 2001 From: GitLab CI Date: Thu, 15 Jan 2026 18:12:01 +0000 Subject: [PATCH] chore: Regenerate all playbooks --- nvidia/nemo-fine-tune/README.md | 33 ++++----- nvidia/pytorch-fine-tune/README.md | 73 +++++++++++++++++-- .../assets/Llama3_70B_LoRA_finetuning.py | 4 +- .../assets/Llama3_8B_LoRA_finetuning.py | 5 +- 4 files changed, 88 insertions(+), 27 deletions(-) diff --git a/nvidia/nemo-fine-tune/README.md b/nvidia/nemo-fine-tune/README.md index 80ee9fd..b06467b 100644 --- a/nvidia/nemo-fine-tune/README.md +++ b/nvidia/nemo-fine-tune/README.md @@ -47,9 +47,8 @@ All necessary files for the playbook can be found [here on GitHub](https://githu * **Duration:** 45-90 minutes for complete setup and initial model fine-tuning * **Risks:** Model downloads can be large (several GB), ARM64 package compatibility issues may require troubleshooting, distributed training setup complexity increases with multi-node configurations * **Rollback:** Virtual environments can be completely removed; no system-level changes are made to the host system beyond package installations. -* **Last Updated:** 12/22/2025 - * Upgrade to latest pytorch container version nvcr.io/nvidia/pytorch:25.11-py3 - * Add docker container permission setup instructioins +* **Last Updated:** 01/15/2026 + * Fix qLoRA fine-tuning workflow ## Instructions @@ -69,6 +68,13 @@ nvidia-smi ## Check available system memory free -h + +## Docker permission: +docker ps + +## if there is permission issue, (e.g., permission denied while trying to connect to the Docker daemon socket), then do: +sudo usermod -aG docker $USER +newgrp docker ``` ## Step 2. 
Configure Docker permissions @@ -94,13 +100,6 @@ newgrp docker docker pull nvcr.io/nvidia/pytorch:25.11-py3 ``` -If you see a permission denied error (something like permission denied while trying to connect to the Docker daemon socket), add your user to the docker group so that you don't need to run the command with sudo . - -```bash -sudo usermod -aG docker $USER -newgrp docker -``` - ## Step 4. Launch Docker ```bash @@ -239,7 +238,7 @@ export HF_TOKEN= > - Request and receive access on each model's page (and accept license/terms) before attempting downloads. > - Llama-3.1-8B: [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) > - Qwen3-8B: [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) -> - Mixtral-8x7B: [mistralai/Mixtral-8x7B](https://huggingface.co/mistralai/Mixtral-8x7B) +> - Meta-Llama-3-70B: [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B) > > The same steps apply for any other gated model you use: visit its model card on Hugging Face, request access, accept the license, and wait for approval. @@ -255,13 +254,13 @@ examples/llm_finetune/finetune.py \ -c examples/llm_finetune/llama3_2/llama3_2_1b_squad_peft.yaml \ --model.pretrained_model_name_or_path meta-llama/Llama-3.1-8B \ --packed_sequence.packed_sequence_size 1024 \ ---step_scheduler.max_steps 100 +--step_scheduler.max_steps 20 ``` These overrides ensure the Llama-3.1-8B LoRA run behaves as expected: - `--model.pretrained_model_name_or_path`: selects the Llama-3.1-8B model to fine-tune from the Hugging Face model hub (weights fetched via your Hugging Face token). - `--packed_sequence.packed_sequence_size`: sets the packed sequence size to 1024 to enable packed sequence training. -- `--step_scheduler.max_steps`: sets the maximum number of training steps. We set it to 100 for demonstation purposes, please adjust this based on your needs. +- `--step_scheduler.max_steps`: sets the maximum number of training steps. 
We set it to 20 for demonstration purposes, please adjust this based on your needs. **QLoRA fine-tuning example:** @@ -276,14 +275,14 @@ examples/llm_finetune/finetune.py \ --loss_fn._target_ nemo_automodel.components.loss.te_parallel_ce.TEParallelCrossEntropy \ --step_scheduler.local_batch_size 1 \ --packed_sequence.packed_sequence_size 1024 \ ---step_scheduler.max_steps 100 +--step_scheduler.max_steps 20 ``` These overrides ensure the 70B QLoRA run behaves as expected: - `--model.pretrained_model_name_or_path`: selects the 70B base model to fine-tune (weights fetched via your Hugging Face token). - `--loss_fn._target_`: uses the TransformerEngine-parallel cross-entropy loss variant compatible with tensor-parallel training for large LLMs. - `--step_scheduler.local_batch_size`: sets the per-GPU micro-batch size to 1 to fit 70B in memory; overall effective batch size is still driven by gradient accumulation and data/tensor parallel settings from the recipe. -- `--step_scheduler.max_steps`: sets the maximum number of training steps. We set it to 100 for demonstation purposes, please adjust this based on your needs. +- `--step_scheduler.max_steps`: sets the maximum number of training steps. We set it to 20 for demonstration purposes, please adjust this based on your needs. - `--packed_sequence.packed_sequence_size`: sets the packed sequence size to 1024 to enable packed sequence training. **Full Fine-tuning example:** @@ -296,12 +295,12 @@ examples/llm_finetune/finetune.py \ -c examples/llm_finetune/qwen/qwen3_8b_squad_spark.yaml \ --model.pretrained_model_name_or_path Qwen/Qwen3-8B \ --step_scheduler.local_batch_size 1 \ ---step_scheduler.max_steps 100 \ +--step_scheduler.max_steps 20 \ --packed_sequence.packed_sequence_size 1024 ``` These overrides ensure the Qwen3-8B SFT run behaves as expected: - `--model.pretrained_model_name_or_path`: selects the Qwen/Qwen3-8B model to fine-tune from the Hugging Face model hub (weights fetched via your Hugging Face token).
Adjust this if you want to fine-tune a different model. -- `--step_scheduler.max_steps`: sets the maximum number of training steps. We set it to 100 for demonstation purposes, please adjust this based on your needs. +- `--step_scheduler.max_steps`: sets the maximum number of training steps. We set it to 20 for demonstration purposes, please adjust this based on your needs. - `--step_scheduler.local_batch_size`: sets the per-GPU micro-batch size to 1 to fit in memory; overall effective batch size is still driven by gradient accumulation and data/tensor parallel settings from the recipe. diff --git a/nvidia/pytorch-fine-tune/README.md b/nvidia/pytorch-fine-tune/README.md index 0860692..60140ac 100644 --- a/nvidia/pytorch-fine-tune/README.md +++ b/nvidia/pytorch-fine-tune/README.md @@ -40,7 +40,7 @@ By the end, you'll have a working installation that supports parameter-efficient ## Prerequisites -Recipes are specifically for DIGITS SPARK. Please make sure that OS and drivers are latest. +Recipes are specifically for DGX SPARK. Please make sure that OS and drivers are latest. ## Ancillary files @@ -51,8 +51,9 @@ ALl files required for fine-tuning are included in the folder in [the GitHub rep * **Time estimate:** 30-45 mins for setup and runing fine-tuning. Fine-tuning run time varies depending on model size * **Risks:** Model downloads can be large (several GB), ARM64 package compatibility issues may require troubleshooting. -* **Last Updated:** 01/02/2025 +* **Last Updated:** 01/15/2026 * Add two-Spark distributed finetuning example + * Add detailed instructions to run full SFT, LoRA and qLoRA workflows on Llama3 3B, 8B and 70B models.
## Instructions @@ -111,15 +112,75 @@ cd dgx-spark-playbooks/nvidia/pytorch-fine-tune/assets ## Step7: Run the fine-tuning recipes -To run LoRA on Llama3-8B use the following command: +#### Available Fine-Tuning Scripts + +The following fine-tuning scripts are provided, each optimized for different model sizes and training approaches: + +| Script | Model | Fine-Tuning Type | Description | +|--------|-------|------------------|-------------| +| `Llama3_3B_full_finetuning.py` | Llama 3.2 3B | Full SFT | Full supervised fine-tuning (all parameters trainable) | +| `Llama3_8B_LoRA_finetuning.py` | Llama 3.1 8B | LoRA | Low-Rank Adaptation (parameter-efficient) | +| `Llama3_70B_LoRA_finetuning.py` | Llama 3.1 70B | LoRA | Low-Rank Adaptation with FSDP support | +| `Llama3_70B_qLoRA_finetuning.py` | Llama 3.1 70B | QLoRA | Quantized LoRA (4-bit quantization for memory efficiency) | + +#### Basic Usage + +Run any script with default settings: + ```bash +## Full fine-tuning on Llama 3.2 3B +python Llama3_3B_full_finetuning.py + +## LoRA fine-tuning on Llama 3.1 8B python Llama3_8B_LoRA_finetuning.py + +## LoRA fine-tuning on Llama 3.1 70B +python Llama3_70B_LoRA_finetuning.py ``` -To run full fine-tuning on llama3-3B use the following command: +#### Common Command-Line Arguments + +All scripts support the following command-line arguments for customization: + +##### Model Configuration +- `--model_name`: Model name or path (default: varies by script) +- `--dtype`: Model precision - `float32`, `float16`, or `bfloat16` (default: `bfloat16`) + +##### Training Configuration +- `--batch_size`: Per-device training batch size (default: varies by script) +- `--seq_length`: Maximum sequence length (default: `2048`) +- `--num_epochs`: Number of training epochs (default: `1`) +- `--gradient_accumulation_steps`: Gradient accumulation steps (default: `1`) +- `--learning_rate`: Learning rate (default: varies by script) +- `--gradient_checkpointing`: Enable gradient checkpointing to save 
memory (flag) + +##### LoRA Configuration (LoRA and QLoRA scripts only) +- `--lora_rank`: LoRA rank - higher values = more trainable parameters (default: `8`) + +##### Dataset Configuration +- `--dataset_size`: Number of samples to use from the Alpaca dataset (default: `500`) + +##### Logging Configuration +- `--logging_steps`: Log metrics every N steps (default: `1`) +- `--log_dir`: Directory for TensorBoard logs (default: `logs`) + +##### Model Saving +- `--output_dir`: Directory to save the fine-tuned model (default: `None` - model not saved) + +##### Performance Optimization +- `--use_torch_compile`: Enable `torch.compile()` for faster training (flag) + +> [!WARNING] +> **Important:** The `--use_torch_compile` flag is **not compatible with QLoRA** (`Llama3_70B_qLoRA_finetuning.py`). +> Only use this flag with full fine-tuning and standard LoRA scripts. + +#### Usage Examples ```bash -python Llama3_3B_full_finetuning.py -``` +python Llama3_8B_LoRA_finetuning.py \ + --dataset_size 100 \ + --num_epochs 1 \ + --batch_size 2 + ``` ## Run on two Sparks diff --git a/nvidia/pytorch-fine-tune/assets/Llama3_70B_LoRA_finetuning.py b/nvidia/pytorch-fine-tune/assets/Llama3_70B_LoRA_finetuning.py index b8731df..05f3aa4 100644 --- a/nvidia/pytorch-fine-tune/assets/Llama3_70B_LoRA_finetuning.py +++ b/nvidia/pytorch-fine-tune/assets/Llama3_70B_LoRA_finetuning.py @@ -77,6 +77,7 @@ def main(args): lora_dropout=0, task_type=TaskType.CAUSAL_LM ) + model = get_peft_model(model, peft_config) trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) total_params = sum(p.numel() for p in model.parameters()) @@ -130,7 +131,6 @@ def main(args): processing_class=tokenizer, train_dataset=dataset, args=SFTConfig(**config), - peft_config=peft_config, ) trainer_stats = trainer.train() @@ -220,4 +220,4 @@ if __name__ == "__main__": print(f"Torch compile: {args.use_torch_compile}") print(f"{'='*60}\n") - main(args) + main(args) \ No newline at end of file diff --git 
a/nvidia/pytorch-fine-tune/assets/Llama3_8B_LoRA_finetuning.py b/nvidia/pytorch-fine-tune/assets/Llama3_8B_LoRA_finetuning.py index a0e68ed..f9a1e8e 100644 --- a/nvidia/pytorch-fine-tune/assets/Llama3_8B_LoRA_finetuning.py +++ b/nvidia/pytorch-fine-tune/assets/Llama3_8B_LoRA_finetuning.py @@ -62,6 +62,8 @@ def main(args): lora_alpha=16, lora_dropout=0, task_type=TaskType.CAUSAL_LM) + model = get_peft_model(model, peft_config) + print(f"Trainable parameters = {sum(p.numel() for p in model.parameters() if p.requires_grad):,}") # Load and preprocess the dataset @@ -110,7 +112,6 @@ def main(args): processing_class=tokenizer, train_dataset=dataset, args=SFTConfig(**config), - peft_config=peft_config, ) trainer_stats = trainer.train() @@ -183,4 +184,4 @@ if __name__ == "__main__": print(f"Torch compile: {args.use_torch_compile}") print(f"{'='*60}\n") - main(args) + main(args) \ No newline at end of file