RunPod
HF_HOME=/workspace/.cache/huggingface
- PUBLIC_KEY
- for ssh
 
- JUPYTER_PASSWORD
- Pod
- vRAM 48G ~$0.5/hr
- vRAM 80G ~$2.0/hr
- Spot (抢占, i.e. preemptible/interruptible instances): ~50% off
 
# Refresh the apt package index, then install system-info and monitoring tools.
apt update
apt install -y neofetch nvtop htop btop
# Hugging Face Hub client package; the huggingface-cli commands below rely on it.
pip install huggingface_hub
# For gated models: log in before downloading.
# The "store" credential helper keeps credentials in ~/.git-credentials.
git config --global credential.helper store
huggingface-cli login
huggingface-cli download google/gemma-3-12b-it
pip install vllm flash_attn accelerate
# NOTE(review): the download above fetches gemma-3-12b-it, but this serves
# gemma-3-27b-it - presumably the 27b weights get fetched on first start; confirm
# which model is intended.
vllm serve "google/gemma-3-27b-it"
# Latest transformers (installed from the main-branch archive)
pip install https://github.com/huggingface/transformers/archive/refs/heads/main.zip
# / is the Container Disk - it is not persisted
# /workspace is the Volume Disk
df -h / /workspace
neofetch
nvidia-smi
# PDF to Images
# -y answers the confirmation prompt automatically, so the install also works
# when these commands are pasted or run unattended (same as the apt install above).
apt-get install -y poppler-utils
# Brace expansion turns the argument into "文件.pdf 文件": the input PDF followed
# by the output prefix. Pages are written as PNG files, e.g. 文件-1.png
pdftoppm -png 文件{.pdf,}
# Ollama
# ==========
# Install Ollama by piping the official install script into sh.
curl -fsSL https://ollama.com/install.sh | sh
# NOTE(review): `ollama serve` presumably keeps running in the foreground, so the
# pull below likely has to be issued from a second shell (or with the server
# backgrounded) - confirm for this environment.
ollama serve
# Fetch the Qwen2.5-VL 32B model.
ollama pull qwen2.5vl:32b
# FRP
# ==========
# Download the frp v0.62.1 release tarball for linux/amd64.
curl -LO https://github.com/fatedier/frp/releases/download/v0.62.1/frp_0.62.1_linux_amd64.tar.gz
# Extract only the frpc client binary; --strip-components=1 drops the top-level
# directory so the binary lands in the current directory as ./frpc.
tar -xzf frp_0.62.1_linux_amd64.tar.gz --strip-components=1 frp_0.62.1_linux_amd64/frpc
# Sanity check: print the client version.
./frpc -v
!pip install vllm
#[]
import os
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest
# Choose the attention backend through vLLM's environment variable; it is put in
# place before the engine below is constructed.
os.environ.update(VLLM_ATTENTION_BACKEND="FLASHINFER")

# Build the Gemma 3 27B engine with LoRA support switched on.
llm = LLM(
    model="google/gemma-3-27b-it",
    enable_lora=True,
)
import torch
from transformers import pipeline
# Image + text -> text pipeline for Gemma 3 on the GPU, using bfloat16 weights.
# Alternative checkpoints: "google/gemma-3-12b-it", "google/gemma-3-27b-it"
pipe = pipeline("image-text-to-text", model="google/gemma-3-4b-it", device="cuda", torch_dtype=torch.bfloat16)

# A single user turn: an image referenced by URL, followed by a question about it.
messages = [
    dict(
        role="user",
        content=[
            dict(type="image", url="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"),
            dict(type="text", text="What animal is on the candy?"),
        ],
    ),
]

# Generate at most 200 new tokens for the conversation.
output = pipe(text=messages, max_new_tokens=200)

# Print the "content" of the last entry in the first result's generated_text.
last_message = output[0]["generated_text"][-1]
print(last_message["content"])