"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"video_path = get_video_path(dataset_path, train_dataset[0]['video'])\n",
"display(load_video(video_path)[0])"
@@ -3113,7 +359,7 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": null,
"id": "8f8e4c46",
"metadata": {
"lines_to_next_cell": 2
@@ -3253,15 +499,7 @@
"metadata": {
"lines_to_next_cell": 2
},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{\"caption\": \"The video shows a stationary vehicle at an urban intersection under a bridge. There are no significant movements or interactions with other vehicles or pedestrians, and the scene remains calm with clear weather conditions.\", \"event_type\": \"no_incident\", \"cause_of_risk\": [], \"presence_of_rule_violations\": [], \"intended_driving_action\": \"enter_road\", \"traffic_density\": \"low\", \"driving_setting\": \"intersection\", \"time_of_day\": \"day\", \"light_conditions\": \"Normal\", \"weather\": \"Clear\", \"scene\": \"Urban\"}\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# View a sample label to confirm it's in the format we want\n",
"sample = train_dataset[0]\n",
@@ -3437,7 +675,7 @@
},
"outputs": [],
"source": [
- "save_dir = \"eval-run-1\" # feel free to adjust\n",
+ "save_dir = \"../saved_model\"\n",
"\n",
"trainer = SFTTrainer(\n",
" model=model,\n",
@@ -3487,112 +725,10 @@
},
{
"cell_type": "code",
- "execution_count": 23,
+ "execution_count": null,
"id": "7ab9fa74",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 151645, 'pad_token_id': 151643}.\n",
- "2025-09-18 01:02:39,337 - INFO - vision_config is None. Initializing the InternVisionConfig with default values.\n",
- "2025-09-18 01:02:39,339 - INFO - llm_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`).\n",
- "2025-09-18 01:02:39,339 - INFO - vision_select_layer: -1\n",
- "2025-09-18 01:02:39,340 - INFO - ps_version: v1\n",
- "2025-09-18 01:02:39,340 - INFO - min_dynamic_patch: 1\n",
- "2025-09-18 01:02:39,340 - INFO - max_dynamic_patch: 6\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Resuming from None\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/usr/local/lib/python3.12/dist-packages/torch/_dynamo/eval_frame.py:893: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n",
- " return fn(*args, **kwargs)\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " \n",
- "
\n",
- " [ 46/30480 35:50 < 413:15:07, 0.02 it/s, Epoch 0.04/30]\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | Step | \n",
- " Training Loss | \n",
- " Validation Loss | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- "
"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2025-09-18 01:27:08,429 - INFO - vision_config is None. Initializing the InternVisionConfig with default values.\n",
- "2025-09-18 01:27:08,429 - INFO - llm_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`).\n",
- "2025-09-18 01:27:08,430 - INFO - vision_select_layer: -1\n",
- "2025-09-18 01:27:08,430 - INFO - ps_version: v1\n",
- "2025-09-18 01:27:08,430 - INFO - min_dynamic_patch: 1\n",
- "2025-09-18 01:27:08,430 - INFO - max_dynamic_patch: 6\n",
- "2025-09-18 01:27:08,431 - INFO - vision_config is None. Initializing the InternVisionConfig with default values.\n",
- "2025-09-18 01:27:08,431 - INFO - llm_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`).\n",
- "2025-09-18 01:27:08,431 - INFO - vision_select_layer: -1\n",
- "2025-09-18 01:27:08,431 - INFO - ps_version: v1\n",
- "2025-09-18 01:27:08,431 - INFO - min_dynamic_patch: 1\n",
- "2025-09-18 01:27:08,431 - INFO - max_dynamic_patch: 6\n",
- "2025-09-18 01:27:08,433 - INFO - vision_config is None. Initializing the InternVisionConfig with default values.\n",
- "2025-09-18 01:27:08,434 - INFO - llm_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`).\n",
- "2025-09-18 01:27:08,434 - INFO - vision_select_layer: -1\n",
- "2025-09-18 01:27:08,434 - INFO - ps_version: v1\n",
- "2025-09-18 01:27:08,434 - INFO - min_dynamic_patch: 1\n",
- "2025-09-18 01:27:08,435 - INFO - max_dynamic_patch: 6\n",
- "/usr/local/lib/python3.12/dist-packages/torch/_dynamo/eval_frame.py:893: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n",
- " return fn(*args, **kwargs)\n"
- ]
- },
- {
- "ename": "KeyboardInterrupt",
- "evalue": "",
- "output_type": "error",
- "traceback": [
- "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
- "\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)",
- "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[23]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m 1\u001b[39m last_ckpt = get_last_checkpoint(save_dir)\n\u001b[32m 2\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mResuming from \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlast_ckpt\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m \u001b[43mtrainer\u001b[49m\u001b[43m.\u001b[49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m=\u001b[49m\u001b[43mlast_ckpt\u001b[49m\u001b[43m)\u001b[49m\n",
- "\u001b[36mFile \u001b[39m\u001b[32m/usr/local/lib/python3.12/dist-packages/transformers/trainer.py:2328\u001b[39m, in \u001b[36mTrainer.train\u001b[39m\u001b[34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[39m\n\u001b[32m 2326\u001b[39m hf_hub_utils.enable_progress_bars()\n\u001b[32m 2327\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m2328\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 2329\u001b[39m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[43m=\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2330\u001b[39m \u001b[43m \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m=\u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2331\u001b[39m \u001b[43m \u001b[49m\u001b[43mtrial\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2332\u001b[39m \u001b[43m \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m=\u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2333\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
- "\u001b[36mFile \u001b[39m\u001b[32m/usr/local/lib/python3.12/dist-packages/transformers/trainer.py:2672\u001b[39m, in \u001b[36mTrainer._inner_training_loop\u001b[39m\u001b[34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[39m\n\u001b[32m 2665\u001b[39m context = (\n\u001b[32m 2666\u001b[39m functools.partial(\u001b[38;5;28mself\u001b[39m.accelerator.no_sync, model=model)\n\u001b[32m 2667\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m i != \u001b[38;5;28mlen\u001b[39m(batch_samples) - \u001b[32m1\u001b[39m\n\u001b[32m 2668\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m.accelerator.distributed_type != DistributedType.DEEPSPEED\n\u001b[32m 2669\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m contextlib.nullcontext\n\u001b[32m 2670\u001b[39m )\n\u001b[32m 2671\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m context():\n\u001b[32m-> \u001b[39m\u001b[32m2672\u001b[39m tr_loss_step = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mtraining_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_items_in_batch\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 2674\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[32m 2675\u001b[39m args.logging_nan_inf_filter\n\u001b[32m 2676\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_xla_available()\n\u001b[32m 2677\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m (torch.isnan(tr_loss_step) \u001b[38;5;129;01mor\u001b[39;00m torch.isinf(tr_loss_step))\n\u001b[32m 2678\u001b[39m ):\n\u001b[32m 2679\u001b[39m \u001b[38;5;66;03m# if loss is nan or inf simply add the average of previous logged losses\u001b[39;00m\n\u001b[32m 2680\u001b[39m tr_loss = tr_loss + tr_loss / (\u001b[32m1\u001b[39m + \u001b[38;5;28mself\u001b[39m.state.global_step - \u001b[38;5;28mself\u001b[39m._globalstep_last_logged)\n",
- "\u001b[36mFile \u001b[39m\u001b[32m/usr/local/lib/python3.12/dist-packages/trl/trainer/sft_trainer.py:872\u001b[39m, in \u001b[36mSFTTrainer.training_step\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 870\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mtraining_step\u001b[39m(\u001b[38;5;28mself\u001b[39m, *args, **kwargs):\n\u001b[32m 871\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m.maybe_activation_offload_context:\n\u001b[32m--> \u001b[39m\u001b[32m872\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[43mtraining_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
- "\u001b[36mFile \u001b[39m\u001b[32m/usr/local/lib/python3.12/dist-packages/transformers/trainer.py:4060\u001b[39m, in \u001b[36mTrainer.training_step\u001b[39m\u001b[34m(***failed resolving arguments***)\u001b[39m\n\u001b[32m 4057\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.accelerator.distributed_type == DistributedType.DEEPSPEED:\n\u001b[32m 4058\u001b[39m kwargs[\u001b[33m\"\u001b[39m\u001b[33mscale_wrt_gas\u001b[39m\u001b[33m\"\u001b[39m] = \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m4060\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43maccelerator\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43mloss\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 4062\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m loss.detach()\n",
- "\u001b[36mFile \u001b[39m\u001b[32m/usr/local/lib/python3.12/dist-packages/accelerate/accelerator.py:2734\u001b[39m, in \u001b[36mAccelerator.backward\u001b[39m\u001b[34m(self, loss, **kwargs)\u001b[39m\n\u001b[32m 2732\u001b[39m \u001b[38;5;28mself\u001b[39m.lomo_backward(loss, learning_rate)\n\u001b[32m 2733\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m2734\u001b[39m \u001b[43mloss\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
- "\u001b[36mFile \u001b[39m\u001b[32m/usr/local/lib/python3.12/dist-packages/torch/_tensor.py:648\u001b[39m, in \u001b[36mTensor.backward\u001b[39m\u001b[34m(self, gradient, retain_graph, create_graph, inputs)\u001b[39m\n\u001b[32m 638\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[32m 639\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[32m 640\u001b[39m Tensor.backward,\n\u001b[32m 641\u001b[39m (\u001b[38;5;28mself\u001b[39m,),\n\u001b[32m (...)\u001b[39m\u001b[32m 646\u001b[39m inputs=inputs,\n\u001b[32m 647\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m648\u001b[39m \u001b[43mtorch\u001b[49m\u001b[43m.\u001b[49m\u001b[43mautograd\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 649\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m=\u001b[49m\u001b[43minputs\u001b[49m\n\u001b[32m 650\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
- "\u001b[36mFile \u001b[39m\u001b[32m/usr/local/lib/python3.12/dist-packages/torch/autograd/__init__.py:354\u001b[39m, in \u001b[36mbackward\u001b[39m\u001b[34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[39m\n\u001b[32m 349\u001b[39m retain_graph = create_graph\n\u001b[32m 351\u001b[39m \u001b[38;5;66;03m# The reason we repeat the same comment below is that\u001b[39;00m\n\u001b[32m 352\u001b[39m \u001b[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001b[39;00m\n\u001b[32m 353\u001b[39m \u001b[38;5;66;03m# calls in the traceback and some print out the last line\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m354\u001b[39m \u001b[43m_engine_run_backward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 355\u001b[39m \u001b[43m \u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 356\u001b[39m \u001b[43m \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 357\u001b[39m \u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 358\u001b[39m \u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 359\u001b[39m \u001b[43m \u001b[49m\u001b[43minputs_tuple\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 360\u001b[39m \u001b[43m \u001b[49m\u001b[43mallow_unreachable\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 361\u001b[39m \u001b[43m \u001b[49m\u001b[43maccumulate_grad\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 362\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
- "\u001b[36mFile \u001b[39m\u001b[32m/usr/local/lib/python3.12/dist-packages/torch/autograd/graph.py:829\u001b[39m, in \u001b[36m_engine_run_backward\u001b[39m\u001b[34m(t_outputs, *args, **kwargs)\u001b[39m\n\u001b[32m 827\u001b[39m unregister_hooks = _register_logging_hooks_on_whole_graph(t_outputs)\n\u001b[32m 828\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m829\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mVariable\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_execution_engine\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrun_backward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Calls into the C++ engine to run the backward pass\u001b[39;49;00m\n\u001b[32m 830\u001b[39m \u001b[43m \u001b[49m\u001b[43mt_outputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\n\u001b[32m 831\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# Calls into the C++ engine to run the backward pass\u001b[39;00m\n\u001b[32m 832\u001b[39m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[32m 833\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m attach_logging_hooks:\n",
- "\u001b[31mKeyboardInterrupt\u001b[39m: "
- ]
- }
- ],
+ "outputs": [],
"source": [
"last_ckpt = get_last_checkpoint(save_dir)\n",
"print(f\"Resuming from {last_ckpt}\")\n",