Early Access Feature: Full parameter RL tuning is currently in private preview and available to select customers. Join the waitlist to request access.
Full parameter RL tuning is designed for teams that need maximum control over reinforcement learning updates. Unlike LoRA-based RFT, this mode updates all model weights (loraRank=0) while keeping a familiar Tinker-style training loop. The current preview covers reinforcement training via RLOR trainer jobs.
Service-mode RLOR trainers currently support full-parameter tuning only. If serviceMode=true, set trainingConfig.loraRank (or SDK lora_rank) to 0; values greater than 0 are rejected.

What this unlocks

  • Custom RL objectives: Implement GRPO, DPO, PPO, or custom reward shaping logic in Python
  • Tinker-compatible primitives: Use forward(), forward_backward_custom(), and optim_step() directly
  • Service-mode trainers: Run the trainer as an API service and iterate quickly from your own script
  • Checkpoint-to-serving path: Save checkpoints and optionally hot-load them into inference deployments
If LoRA-based RFT already meets your quality and latency targets, start there. Move to full-parameter tuning when LoRA quality saturates or your use case requires full-weight updates.

End-to-end workflow

The workflow below reflects the latest cookbook-style setup: create serving infrastructure first, create the RLOR trainer, then connect with tinker.ServiceClient. If you want the same flow in one file, jump to Single-file starter script.

1) Create an inference deployment (Fireworks SDK)

from fireworks.client import LLM

base_model = "accounts/fireworks/models/kimi-k2-5-instruct"
deployment_id = "fp-rft-serving"

# This deployment can be used as the optional hot-load target.
serving_llm = LLM(
    model=base_model,
    id=deployment_id,
    deployment_type="on-demand",
    min_replica_count=0,
    max_replica_count=1,
)
serving_llm.apply()
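Optionally, smoke-test the deployment with a plain chat completion before creating the trainer. This is a sketch, not a required step: it assumes the model#deployment identifier convention for targeting a specific deployment and reuses the api_key and account_id placeholders from step 2. With min_replica_count=0 the first request may arrive while replicas are still scaling up, so allow a generous timeout.

import requests

api_key = "fw_..."
account_id = "your-account-id"

# Address the specific deployment by suffixing the base model with the deployment resource name.
resp = requests.post(
    "https://api.fireworks.ai/inference/v1/chat/completions",
    headers={"Authorization": f"Bearer {api_key}"},
    json={
        "model": f"{base_model}#accounts/{account_id}/deployments/{deployment_id}",
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "max_tokens": 64,
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])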

2) Create an RLOR trainer job in service mode

import requests

api_key = "fw_..."
account_id = "your-account-id"
api_base = "https://api.fireworks.ai/v1"

payload = {
    "displayName": "fp-rft-trainer",
    "serviceMode": True,
    "hotLoadDeploymentId": deployment_id,  # optional but recommended
    "trainingConfig": {
        "baseModel": base_model,
        "loraRank": 0,  # 0 means full parameter updates
        "learningRate": 1e-5,
        "maxContextLength": 4096,
        "gradientAccumulationSteps": 4,
    },
}

resp = requests.post(
    f"{api_base}/accounts/{account_id}/rlorTrainerJobs",
    headers={"Authorization": f"Bearer {api_key}"},
    json=payload,
    timeout=60,
)
resp.raise_for_status()
job = resp.json()
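# The returned resource name has the form "accounts/<account_id>/rlorTrainerJobs/<job_id>".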
job_id = job["name"].split("/")[-1]

3) Wait for trainer readiness, then connect via Tinker

import time
import requests
import tinker

while True:
    job_resp = requests.get(
        f"{api_base}/accounts/{account_id}/rlorTrainerJobs/{job_id}",
        headers={"Authorization": f"Bearer {api_key}"},
        timeout=30,
    )
    job_resp.raise_for_status()
    job = job_resp.json()

    state = job.get("state")
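    # directRouteHandle is the trainer's direct service URL; it is populated once the trainer endpoint is ready.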
    trainer_url = job.get("directRouteHandle")

    if state == "JOB_STATE_RUNNING" and trainer_url:
        break
    if state in {"JOB_STATE_FAILED", "JOB_STATE_CANCELLED", "JOB_STATE_EXPIRED"}:
        raise RuntimeError(f"Trainer ended in state={state}")

    time.sleep(10)

service = tinker.ServiceClient(base_url=trainer_url, api_key=api_key)
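# rank=0 mirrors loraRank=0 in the trainer config and requests full-parameter updates.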
training_client = service.create_lora_training_client(
    base_model=base_model,
    rank=0,
)

4) Run your custom loop and save checkpoints

import tinker

def your_loss_fn(data, logprobs_list):
    # Plug in your GRPO / DPO / PPO objective here.
    # Return: (loss_tensor, metrics_dict)
    loss = compute_custom_loss(logprobs_list)
    return loss, {"loss": float(loss.item())}

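# training_batches is an iterable of tinker.Datum lists that you build yourself; see the sketch below.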
for step, batch in enumerate(training_batches):
    training_client.forward_backward_custom(batch, your_loss_fn).result()

    if (step + 1) % 4 == 0:  # match gradientAccumulationSteps
        training_client.optim_step(
            tinker.AdamParams(
                learning_rate=1e-5,
                beta1=0.9,
                beta2=0.999,
                eps=1e-8,
                weight_decay=0.01,
            )
        ).result()

checkpoint = training_client.save_weights_for_sampler("checkpoint_step_100").result()
print("Saved checkpoint:", checkpoint.path)

Single-file starter script

If you prefer one script with all setup steps in one place, start from this template and replace the placeholder dataset/loss logic:
#!/usr/bin/env python3
"""Single-file starter for full parameter RL tuning with Tinker."""

import os
import time
import requests
import tinker
from fireworks.client import LLM

API_BASE = "https://api.fireworks.ai/v1"
API_KEY = os.environ["FIREWORKS_API_KEY"]
ACCOUNT_ID = os.environ["FIREWORKS_ACCOUNT_ID"]
BASE_MODEL = "accounts/fireworks/models/kimi-k2-5-instruct"
DEPLOYMENT_ID = "fp-rft-serving"


def auth_headers():
    return {"Authorization": f"Bearer {API_KEY}"}


def ensure_inference_deployment() -> None:
    llm = LLM(
        model=BASE_MODEL,
        id=DEPLOYMENT_ID,
        deployment_type="on-demand",
        min_replica_count=0,
        max_replica_count=1,
    )
    llm.apply()


def create_rlor_service_job() -> str:
    payload = {
        "displayName": "fp-rft-trainer",
        "serviceMode": True,
        "hotLoadDeploymentId": DEPLOYMENT_ID,
        "trainingConfig": {
            "baseModel": BASE_MODEL,
            "loraRank": 0,
            "learningRate": 1e-5,
            "maxContextLength": 4096,
            "gradientAccumulationSteps": 4,
        },
    }
    resp = requests.post(
        f"{API_BASE}/accounts/{ACCOUNT_ID}/rlorTrainerJobs",
        headers=auth_headers(),
        json=payload,
        timeout=60,
    )
    resp.raise_for_status()
    return resp.json()["name"].split("/")[-1]


def wait_for_trainer_url(job_id: str) -> str:
    while True:
        resp = requests.get(
            f"{API_BASE}/accounts/{ACCOUNT_ID}/rlorTrainerJobs/{job_id}",
            headers=auth_headers(),
            timeout=30,
        )
        resp.raise_for_status()
        job = resp.json()
        state = job.get("state")
        trainer_url = job.get("directRouteHandle")

        if state == "JOB_STATE_RUNNING" and trainer_url:
            return trainer_url
        if state in {"JOB_STATE_FAILED", "JOB_STATE_CANCELLED", "JOB_STATE_EXPIRED"}:
            raise RuntimeError(f"trainer failed in state={state}")
        time.sleep(10)


def custom_loss_fn(data, logprobs_list):
    # Replace with your GRPO / DPO / PPO objective.
    loss = compute_custom_loss(logprobs_list)
    return loss, {"loss": float(loss.item())}


def compute_custom_loss(logprobs_list):
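    # See the advantage-weighted sketch after step 4 for one way to implement this.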
    raise NotImplementedError("Implement your RL loss (GRPO/DPO/PPO/custom).")


def build_training_batches():
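    # See the tinker.Datum batch sketch after step 4 for one way to construct these.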
    raise NotImplementedError("Build and return iterable batches of tinker.Datum objects.")


def main() -> None:
    ensure_inference_deployment()
    job_id = create_rlor_service_job()
    trainer_url = wait_for_trainer_url(job_id)

    service = tinker.ServiceClient(base_url=trainer_url, api_key=API_KEY)
    training_client = service.create_lora_training_client(base_model=BASE_MODEL, rank=0)

    # Build your own list of tinker.Datum batches.
    training_batches = build_training_batches()

    for step, batch in enumerate(training_batches):
        training_client.forward_backward_custom(batch, custom_loss_fn).result()
        if (step + 1) % 4 == 0:
            training_client.optim_step(tinker.AdamParams(learning_rate=1e-5)).result()

    ckpt = training_client.save_weights_for_sampler("checkpoint_step_final").result()
    print("checkpoint path:", ckpt.path)


if __name__ == "__main__":
    main()

Architecture

  • You control: data prep, reward/loss logic, sampling strategy, and experiment tracking.
  • Fireworks handles: distributed trainer orchestration, service endpoint management, checkpoint persistence, and deployment integration.