diff --git a/components/runpod_trainer/Dockerfile b/components/runpod_trainer/Dockerfile index 19e37bf..4cf7fa2 100644 --- a/components/runpod_trainer/Dockerfile +++ b/components/runpod_trainer/Dockerfile @@ -1,4 +1,4 @@ -FROM runpod/pytorch:2.1.0-py3.10-cuda11.8.0-devel-ubuntu22.04 +FROM runpod/pytorch:2.4.0-py3.11-cuda12.4.1-devel-ubuntu22.04 WORKDIR /app diff --git a/components/runpod_trainer/requirements.txt b/components/runpod_trainer/requirements.txt index 90f528e..310f39f 100644 --- a/components/runpod_trainer/requirements.txt +++ b/components/runpod_trainer/requirements.txt @@ -1,9 +1,8 @@ -runpod>=1.6.0 -transformers>=4.36.0 +runpod>=1.7.0 +transformers==4.44.0 datasets>=2.16.0 -accelerate>=0.25.0 +accelerate>=0.30.0 boto3>=1.34.0 scikit-learn>=1.3.0 scipy>=1.11.0 -torch>=2.1.0 safetensors>=0.4.0 diff --git a/pipelines/ddi_data_prep.py b/pipelines/ddi_data_prep.py index e67a313..596f2a2 100644 --- a/pipelines/ddi_data_prep.py +++ b/pipelines/ddi_data_prep.py @@ -178,6 +178,7 @@ def ddi_data_prep_pipeline( model_name: str = "emilyalsentzer/Bio_ClinicalBERT", epochs: int = 3, learning_rate: float = 2e-5, + # NOTE: Using internal endpoint. For Tailscale, add ACL: tag:k8s → tagged-devices:* minio_endpoint: str = "http://minio.minio.svc.cluster.local:9000", ): """ diff --git a/pipelines/ddi_training_runpod.py b/pipelines/ddi_training_runpod.py index fa41acd..bc75c68 100644 --- a/pipelines/ddi_training_runpod.py +++ b/pipelines/ddi_training_runpod.py @@ -198,8 +198,8 @@ def ddi_training_pipeline( learning_rate: float = 2e-5, model_version: str = "v1", - # MinIO settings - use Tailscale endpoint - minio_endpoint: str = "https://minio.walleye-frog.ts.net", + # MinIO settings - internal for now. For Tailscale, add ACL: tag:k8s → tagged-devices:* + minio_endpoint: str = "http://minio.minio.svc.cluster.local:9000", ): """ Full DDI training pipeline: