> ## Documentation Index
> Fetch the complete documentation index at: https://docs.fireworks.ai/llms.txt
> Use this file to discover all available pages before exploring further.

# Create Reinforcement Fine-tuning Step



## OpenAPI

````yaml post /v1/accounts/{account_id}/rlorTrainerJobs
# OpenAPI 3.1 description of the Gateway REST API.
openapi: 3.1.0
info:
  title: Gateway REST API
  version: 4.259.0
servers:
  - url: https://api.fireworks.ai
# Document-level security requirement: operations use the BearerAuth scheme
# declared under components.securitySchemes.
security:
  - BearerAuth: []
tags:
  - name: Gateway
paths:
  # POST creates an RLOR trainer job under the given account. The request body
  # and the 200 response both use the gatewayRlorTrainerJob schema.
  /v1/accounts/{account_id}/rlorTrainerJobs:
    post:
      tags:
        - Gateway
      summary: Create Reinforcement Fine-tuning Step
      operationId: Gateway_CreateRlorTrainerJob
      parameters:
        # Optional query parameter: caller-chosen job ID.
        - name: rlorTrainerJobId
          description: >-
            ID of the RLOR trainer job, a random UUID will be generated if not
            specified.
          in: query
          required: false
          schema:
            type: string
        # Optional query parameter: training-shape resource name.
        - name: trainingShape
          description: >-
            Optional validated training-shape selector for service-mode
            launches.

            Accepted formats:

            - accounts/{account}/trainingShapes/{shape}

            - accounts/{account}/trainingShapes/{shape}/versions/{version}

            - accounts/{account}/trainingShapes/{shape}/versions/latest

            When a shape (without /versions/*) is provided, the latest validated
            version is used.
          in: query
          required: false
          schema:
            type: string
        # Required path parameter bound to {account_id} in the URL template.
        - name: account_id
          in: path
          required: true
          description: The Account Id
          schema:
            type: string
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/gatewayRlorTrainerJob'
        required: true
      # Only the success response is declared for this operation.
      responses:
        '200':
          description: A successful response.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/gatewayRlorTrainerJob'
components:
  schemas:
    # RLOR trainer job resource. Used as both the request body and the 200
    # response of Gateway_CreateRlorTrainerJob. Properties flagged
    # `readOnly: true` are output-only under OpenAPI readOnly semantics.
    gatewayRlorTrainerJob:
      type: object
      properties:
        name:
          type: string
          readOnly: true
        displayName:
          type: string
        createTime:
          type: string
          format: date-time
          readOnly: true
        completedTime:
          type: string
          format: date-time
          readOnly: true
        dataset:
          type: string
          description: The name of the dataset used for training.
        evaluationDataset:
          type: string
          description: The name of a separate dataset to use for evaluation.
        evalAutoCarveout:
          type: boolean
          description: Whether to auto-carve the dataset for eval.
        # Lifecycle state (gatewayJobState enum) and detailed status
        # (gatewayStatus: code + message); both are read-only.
        state:
          $ref: '#/components/schemas/gatewayJobState'
          readOnly: true
        status:
          $ref: '#/components/schemas/gatewayStatus'
          readOnly: true
        createdBy:
          type: string
          description: The email address of the user who initiated this fine-tuning job.
          readOnly: true
        trainingConfig:
          $ref: '#/components/schemas/gatewayBaseTrainingConfig'
          description: Common training configurations.
        rewardWeights:
          type: array
          items:
            type: string
          description: >-
            A list of reward metrics to use for training in format of
            "<reward_name>=<weight>".
        wandbConfig:
          $ref: '#/components/schemas/gatewayWandbConfig'
          description: >-
            The Weights & Biases team/user account for logging training
            progress.
        awsS3Config:
          $ref: '#/components/schemas/gatewayAwsS3Config'
          description: The AWS configuration for S3 dataset access.
        azureBlobStorageConfig:
          $ref: '#/components/schemas/gatewayAzureBlobStorageConfig'
          description: The Azure configuration for Azure Blob Storage dataset access.
        jobProgress:
          $ref: '#/components/schemas/gatewayJobProgress'
          description: Job progress.
          readOnly: true
        keepAlive:
          type: boolean
          title: indicates this RLOR trainer job should run in keep-alive mode
        rolloutDeploymentName:
          type: string
          description: >-
            Rollout deployment name associated with this RLOR trainer job.

            This is optional. If not set, trainer will not trigger weight sync
            to rollout engine.
        lossConfig:
          $ref: '#/components/schemas/gatewayReinforcementLearningLossConfig'
          description: >-
            Reinforcement learning loss method + hyperparameters for the
            underlying trainer.
        nodeCount:
          type: integer
          format: int32
          description: |-
            The number of nodes to use for the fine-tuning job.
            If not specified, the default is 1.
        acceleratorSeconds:
          type: object
          # Map values are int64 counts serialized as strings.
          additionalProperties:
            type: string
            format: int64
          description: >-
            Accelerator seconds used by the job, keyed by accelerator type
            (e.g., "NVIDIA_H100_80GB").

            Updated periodically.
          readOnly: true
        serviceMode:
          type: boolean
          title: >-
            Whether to deploy as a service with tinker-style api endpoints
            exposure
        directRouteHandle:
          type: string
          title: |-
            Only valid when service_mode enabled
            The direct route handle for the trainer in service mode (tinker api)
          readOnly: true
        hotLoadDeploymentId:
          type: string
          description: >-
            The deployment ID used for hot loading. When set, checkpoints are
            saved

            to this deployment's hot load bucket, enabling weight swaps on
            inference.

            Only valid for service-mode or keep-alive jobs.
        purpose:
          $ref: '#/components/schemas/gatewayPurpose'
          description: Scheduling purpose for this job.
        forwardOnly:
          type: boolean
          description: >-
            When true, run the trainer in forward-only mode (no
            backward/optimizer).

            Used for reference models in GRPO that only need forward passes.
        managedBy:
          type: string
          description: For managed service use only. Users do not need to set this field.
      # NOTE(review): this title reads like an internal field-numbering note
      # rather than a human-readable schema title — confirm it is meant to be
      # published.
      title: 'Next ID: 36 (field 34 reserved for removed public_logs_signed_url)'
    # String enum of job states; JOB_STATE_UNSPECIFIED is the declared default.
    # The description text documents only the PAUSED and DELETED values.
    gatewayJobState:
      type: string
      enum:
        - JOB_STATE_UNSPECIFIED
        - JOB_STATE_CREATING
        - JOB_STATE_RUNNING
        - JOB_STATE_COMPLETED
        - JOB_STATE_FAILED
        - JOB_STATE_CANCELLED
        - JOB_STATE_DELETING
        - JOB_STATE_WRITING_RESULTS
        - JOB_STATE_VALIDATING
        - JOB_STATE_DELETING_CLEANING_UP
        - JOB_STATE_PENDING
        - JOB_STATE_EXPIRED
        - JOB_STATE_RE_QUEUEING
        - JOB_STATE_CREATING_INPUT_DATASET
        - JOB_STATE_IDLE
        - JOB_STATE_CANCELLING
        - JOB_STATE_EARLY_STOPPED
        - JOB_STATE_PAUSED
        - JOB_STATE_DELETED
      default: JOB_STATE_UNSPECIFIED
      description: |-
        JobState represents the state an asynchronous job can be in.

         - JOB_STATE_PAUSED: Job is paused, typically due to account suspension or manual intervention.
         - JOB_STATE_DELETED: Job has been deleted.
    # Status payload: a gatewayCode value plus a developer-facing message.
    gatewayStatus:
      title: >-
        Mimics
        [https://github.com/googleapis/googleapis/blob/master/google/rpc/status.proto]
      type: object
      properties:
        code:
          description: The status code.
          $ref: "#/components/schemas/gatewayCode"
        message:
          description: A developer-facing error message in English.
          type: string
    # Training hyperparameters shared across training job types; referenced by
    # gatewayRlorTrainerJob.trainingConfig.
    gatewayBaseTrainingConfig:
      type: object
      properties:
        outputModel:
          type: string
          description: >-
            The model ID to be assigned to the resulting fine-tuned model. If
            not specified, the job ID will be used.
        baseModel:
          # Documented as mutually exclusive with warmStartFrom.
          type: string
          description: |-
            The name of the base model to be fine-tuned
            Only one of 'base_model' or 'warm_start_from' should be specified.
        warmStartFrom:
          # Documented as mutually exclusive with baseModel.
          type: string
          description: |-
            The PEFT addon model in Fireworks format to be fine-tuned from
            Only one of 'base_model' or 'warm_start_from' should be specified.
        jinjaTemplate:
          type: string
          title: >-
            The Jinja template for conversation formatting. If not specified,
            defaults to the base model's conversation template configuration
        learningRate:
          type: number
          format: float
          description: The learning rate used for training.
        maxContextLength:
          type: integer
          format: int32
          description: The maximum context length to use with the model.
        loraRank:
          type: integer
          format: int32
          description: The rank of the LoRA layers.
        epochs:
          type: integer
          format: int32
          description: The number of epochs to train for.
        # batchSize is measured in packed tokens; batchSizeSamples (below) is
        # measured in samples.
        batchSize:
          type: integer
          format: int32
          description: >-
            The maximum packed number of tokens per batch for training in
            sequence packing.
        gradientAccumulationSteps:
          type: integer
          format: int32
          title: Number of gradient accumulation steps
        learningRateWarmupSteps:
          type: integer
          format: int32
          title: Number of steps for learning rate warm up
        batchSizeSamples:
          type: integer
          format: int32
          description: The number of samples per gradient batch.
        optimizerWeightDecay:
          type: number
          format: float
          description: Weight decay (L2 regularization) for optimizer.
        trainerShardingScheme:
          $ref: '#/components/schemas/gatewayTrainerShardingScheme'
          description: Structured trainer sharding/parallelism configuration.
        loraAlpha:
          type: integer
          format: int32
          description: |-
            LoRA alpha scaling factor.
            If not specified (or 0), trainer defaults are used.
        loraDropout:
          type: number
          format: float
          description: LoRA dropout probability.
        loraTargetModules:
          type: array
          items:
            type: string
          description: Optional LoRA target module names (e.g. q_proj, k_proj, v_proj).
      title: |-
        BaseTrainingConfig contains common configuration fields shared across
        different training job types.
    # Weights & Biases logging settings for a training job. `url` is the only
    # read-only property.
    gatewayWandbConfig:
      type: object
      properties:
        enabled:
          type: boolean
          description: Whether to enable wandb logging.
        apiKey:
          type: string
          description: The API key for the wandb service.
        project:
          type: string
          description: The project name for the wandb service.
        entity:
          type: string
          description: The entity name for the wandb service.
        runId:
          type: string
          description: The run ID for the wandb service.
        url:
          type: string
          description: The URL for the wandb service.
          readOnly: true
      description: >-
        WandbConfig is the configuration for the Weights & Biases (wandb)
        logging which

        will be used by a training job.
    # AWS S3 dataset access settings. Two credential sources are described:
    # a Secret resource holding access keys, or an IAM role ARN to assume.
    # NOTE(review): whether the two are mutually exclusive is not stated here.
    gatewayAwsS3Config:
      type: object
      properties:
        credentialsSecret:
          type: string
          title: >-
            Reference to a Secret resource containing AWS access key
            credentials.

            Format: accounts/{account_id}/secrets/{secret_id}

            The secret value must be JSON: {"aws_access_key_id": "AKIA...",
            "aws_secret_access_key": "..."}
        iamRoleArn:
          type: string
          title: >-
            IAM role ARN to assume for accessing S3 datasets via GCP OIDC
            federation.

            Format: arn:aws:iam::account-id:role/role-name
      description: |-
        AwsS3Config is the configuration for AWS S3 dataset access which
        will be used by a training job.
    # Azure Blob Storage dataset access settings. credentialsSecret and
    # managedIdentityClientId are documented as mutually exclusive.
    gatewayAzureBlobStorageConfig:
      type: object
      properties:
        credentialsSecret:
          type: string
          description: >-
            Reference to a Secret resource containing Azure credentials.

            Format: accounts/{account_id}/secrets/{secret_id}

            The secret value must be JSON: {"connection_string": "..."} or
            {"sas_token": "..."} or {"account_key": "..."}

            Mutually exclusive with managed_identity_client_id.
        managedIdentityClientId:
          type: string
          description: >-
            Managed Identity Client ID for GCP-to-Azure Workload Identity
            Federation.

            Format: uuid

            Mutually exclusive with credentials_secret.
        tenantId:
          type: string
          title: |-
            Azure tenant ID for Workload Identity Federation.
            Format: uuid
      description: >-
        AzureBlobStorageConfig is the configuration for Azure Blob Storage
        dataset access

        which will be used by a training job.
    # Progress counters for a job; referenced (read-only) by
    # gatewayRlorTrainerJob.jobProgress. All counters are int32.
    gatewayJobProgress:
      type: object
      properties:
        percent:
          type: integer
          format: int32
          description: Progress percent, within the range from 0 to 100.
        epoch:
          type: integer
          format: int32
          description: >-
            The epoch for which the progress percent is reported, usually
            starting from 0.

            This is optional for jobs that don't run in an epoch fashion, e.g.
            BIJ, EVJ.
        totalInputRequests:
          type: integer
          format: int32
          description: Total number of input requests/rows in the job.
        totalProcessedRequests:
          type: integer
          format: int32
          description: >-
            Total number of requests that have been processed (successfully or
            failed).
        successfullyProcessedRequests:
          type: integer
          format: int32
          description: Number of requests that were processed successfully.
        failedRequests:
          type: integer
          format: int32
          description: Number of requests that failed to process.
        outputRows:
          type: integer
          format: int32
          description: Number of output rows generated.
        inputTokens:
          type: integer
          format: int32
          description: Total number of input tokens processed.
        outputTokens:
          type: integer
          format: int32
          description: Total number of output tokens generated.
        cachedInputTokenCount:
          type: integer
          format: int32
          description: The number of input tokens that hit the prompt cache.
      description: Progress of a job, e.g. RLOR, EVJ, BIJ etc.
    # Loss method selection plus method-specific hyperparameters; referenced by
    # gatewayRlorTrainerJob.lossConfig. `dpo` and `orpo` are described as
    # intended for their matching `method` values.
    gatewayReinforcementLearningLossConfig:
      type: object
      properties:
        method:
          $ref: '#/components/schemas/ReinforcementLearningLossConfigMethod'
        klBeta:
          type: number
          format: float
          description: |-
            KL coefficient (beta) override for GRPO-like methods.
            If unset, the trainer default is used.
        dpo:
          $ref: '#/components/schemas/gatewayDpoConfig'
          description: DPO-specific configuration. Intended for METHOD=DPO.
        orpo:
          $ref: '#/components/schemas/gatewayOrpoConfig'
          description: ORPO-specific configuration. Intended for METHOD=ORPO.
      description: >-
        Loss method + hyperparameters for reinforcement-learning-style
        fine-tuning (e.g. RFT / RL trainers).

        For preference jobs (DPO API), the default loss method is GRPO when
        METHOD_UNSPECIFIED.
    # String enum for scheduling purpose; defaults to PURPOSE_UNSPECIFIED.
    gatewayPurpose:
      description: Scheduling purpose for training jobs and deployments.
      type: string
      default: PURPOSE_UNSPECIFIED
      enum:
        - PURPOSE_UNSPECIFIED
        - PURPOSE_PILOT
    # Status code enum referenced by gatewayStatus.code; OK is the declared
    # default. The description lists each value with its HTTP mapping, and the
    # title links to the google.rpc code.proto file this enum mimics.
    gatewayCode:
      type: string
      enum:
        - OK
        - CANCELLED
        - UNKNOWN
        - INVALID_ARGUMENT
        - DEADLINE_EXCEEDED
        - NOT_FOUND
        - ALREADY_EXISTS
        - PERMISSION_DENIED
        - UNAUTHENTICATED
        - RESOURCE_EXHAUSTED
        - FAILED_PRECONDITION
        - ABORTED
        - OUT_OF_RANGE
        - UNIMPLEMENTED
        - INTERNAL
        - UNAVAILABLE
        - DATA_LOSS
      default: OK
      description: |-
        - OK: Not an error; returned on success.

        HTTP Mapping: 200 OK
         - CANCELLED: The operation was cancelled, typically by the caller.

        HTTP Mapping: 499 Client Closed Request
         - UNKNOWN: Unknown error.  For example, this error may be returned when
        a `Status` value received from another address space belongs to
        an error space that is not known in this address space.  Also
        errors raised by APIs that do not return enough error information
        may be converted to this error.

        HTTP Mapping: 500 Internal Server Error
         - INVALID_ARGUMENT: The client specified an invalid argument.  Note that this differs
        from `FAILED_PRECONDITION`.  `INVALID_ARGUMENT` indicates arguments
        that are problematic regardless of the state of the system
        (e.g., a malformed file name).

        HTTP Mapping: 400 Bad Request
         - DEADLINE_EXCEEDED: The deadline expired before the operation could complete. For operations
        that change the state of the system, this error may be returned
        even if the operation has completed successfully.  For example, a
        successful response from a server could have been delayed long
        enough for the deadline to expire.

        HTTP Mapping: 504 Gateway Timeout
         - NOT_FOUND: Some requested entity (e.g., file or directory) was not found.

        Note to server developers: if a request is denied for an entire class
        of users, such as gradual feature rollout or undocumented allowlist,
        `NOT_FOUND` may be used. If a request is denied for some users within
        a class of users, such as user-based access control, `PERMISSION_DENIED`
        must be used.

        HTTP Mapping: 404 Not Found
         - ALREADY_EXISTS: The entity that a client attempted to create (e.g., file or directory)
        already exists.

        HTTP Mapping: 409 Conflict
         - PERMISSION_DENIED: The caller does not have permission to execute the specified
        operation. `PERMISSION_DENIED` must not be used for rejections
        caused by exhausting some resource (use `RESOURCE_EXHAUSTED`
        instead for those errors). `PERMISSION_DENIED` must not be
        used if the caller can not be identified (use `UNAUTHENTICATED`
        instead for those errors). This error code does not imply the
        request is valid or the requested entity exists or satisfies
        other pre-conditions.

        HTTP Mapping: 403 Forbidden
         - UNAUTHENTICATED: The request does not have valid authentication credentials for the
        operation.

        HTTP Mapping: 401 Unauthorized
         - RESOURCE_EXHAUSTED: Some resource has been exhausted, perhaps a per-user quota, or
        perhaps the entire file system is out of space.

        HTTP Mapping: 429 Too Many Requests
         - FAILED_PRECONDITION: The operation was rejected because the system is not in a state
        required for the operation's execution.  For example, the directory
        to be deleted is non-empty, an rmdir operation is applied to
        a non-directory, etc.

        Service implementors can use the following guidelines to decide
        between `FAILED_PRECONDITION`, `ABORTED`, and `UNAVAILABLE`:
         (a) Use `UNAVAILABLE` if the client can retry just the failing call.
         (b) Use `ABORTED` if the client should retry at a higher level. For
             example, when a client-specified test-and-set fails, indicating the
             client should restart a read-modify-write sequence.
         (c) Use `FAILED_PRECONDITION` if the client should not retry until
             the system state has been explicitly fixed. For example, if an "rmdir"
             fails because the directory is non-empty, `FAILED_PRECONDITION`
             should be returned since the client should not retry unless
             the files are deleted from the directory.

        HTTP Mapping: 400 Bad Request
         - ABORTED: The operation was aborted, typically due to a concurrency issue such as
        a sequencer check failure or transaction abort.

        See the guidelines above for deciding between `FAILED_PRECONDITION`,
        `ABORTED`, and `UNAVAILABLE`.

        HTTP Mapping: 409 Conflict
         - OUT_OF_RANGE: The operation was attempted past the valid range.  E.g., seeking or
        reading past end-of-file.

        Unlike `INVALID_ARGUMENT`, this error indicates a problem that may
        be fixed if the system state changes. For example, a 32-bit file
        system will generate `INVALID_ARGUMENT` if asked to read at an
        offset that is not in the range [0,2^32-1], but it will generate
        `OUT_OF_RANGE` if asked to read from an offset past the current
        file size.

        There is a fair bit of overlap between `FAILED_PRECONDITION` and
        `OUT_OF_RANGE`.  We recommend using `OUT_OF_RANGE` (the more specific
        error) when it applies so that callers who are iterating through
        a space can easily look for an `OUT_OF_RANGE` error to detect when
        they are done.

        HTTP Mapping: 400 Bad Request
         - UNIMPLEMENTED: The operation is not implemented or is not supported/enabled in this
        service.

        HTTP Mapping: 501 Not Implemented
         - INTERNAL: Internal errors.  This means that some invariants expected by the
        underlying system have been broken.  This error code is reserved
        for serious errors.

        HTTP Mapping: 500 Internal Server Error
         - UNAVAILABLE: The service is currently unavailable.  This is most likely a
        transient condition, which can be corrected by retrying with
        a backoff. Note that it is not always safe to retry
        non-idempotent operations.

        See the guidelines above for deciding between `FAILED_PRECONDITION`,
        `ABORTED`, and `UNAVAILABLE`.

        HTTP Mapping: 503 Service Unavailable
         - DATA_LOSS: Unrecoverable data loss or corruption.

        HTTP Mapping: 500 Internal Server Error
      title: >-
        Mimics
        [https://github.com/googleapis/googleapis/blob/master/google/rpc/code.proto]
    # Parallelism degrees for trainer launches. For each integer degree the
    # descriptions state that 0 means "unspecified" and the server then
    # defaults to 1.
    gatewayTrainerShardingScheme:
      description: Structured parallelism/sharding profile used by trainer launches.
      type: object
      properties:
        tensorParallelism:
          description: Tensor-parallel degree. 0 means unspecified (server defaults to 1).
          type: integer
          format: int32
        pipelineParallelism:
          description: Pipeline-parallel degree. 0 means unspecified (server defaults to 1).
          type: integer
          format: int32
        contextParallelism:
          description: Context-parallel degree. 0 means unspecified (server defaults to 1).
          type: integer
          format: int32
        expertParallelism:
          description: Expert-parallel degree. 0 means unspecified (server defaults to 1).
          type: integer
          format: int32
        sequenceParallelism:
          description: Whether sequence parallelism should be enabled.
          type: boolean
    # Loss-method enum referenced by
    # gatewayReinforcementLearningLossConfig.method. METHOD_UNSPECIFIED is the
    # declared default and is documented as resolving to GRPO.
    ReinforcementLearningLossConfigMethod:
      type: string
      enum:
        - METHOD_UNSPECIFIED
        - GRPO
        - DAPO
        - DPO
        - ORPO
        - GSPO_TOKEN
      default: METHOD_UNSPECIFIED
      title: |-
        - METHOD_UNSPECIFIED: Defaults to GRPO
         - GRPO: Group Relative Policy Optimization (default for preference jobs)
         - DAPO: Decoupled Clip and Dynamic sAmpling Policy Optimization
         - DPO: Direct Preference Optimization
         - ORPO: Odds Ratio Preference Optimization (reference-free)
         - GSPO_TOKEN: Group Sequence Policy Optimization (token-level)
    # DPO hyperparameters; referenced by
    # gatewayReinforcementLearningLossConfig.dpo.
    gatewayDpoConfig:
      description: Hyperparameters for Direct Preference Optimization (DPO) training.
      type: object
      properties:
        beta:
          description: DPO temperature parameter (beta in the paper).
          type: number
          format: float
        refCacheConcurrency:
          description: Max concurrent reference forward passes during cache warm-up.
          type: integer
          format: int32
        refCacheBatchSize:
          description: Number of preference pairs per reference forward call during caching.
          type: integer
          format: int32
    # ORPO hyperparameters; referenced by
    # gatewayReinforcementLearningLossConfig.orpo.
    gatewayOrpoConfig:
      description: Hyperparameters for Odds Ratio Preference Optimization (ORPO) training.
      type: object
      properties:
        lambda:
          description: Weight for the ORPO odds-ratio loss term.
          type: number
          format: float
  securitySchemes:
    # HTTP bearer scheme named by the document-level `security` requirement.
    BearerAuth:
      description: 'Bearer authentication using your Fireworks API key. Format: Bearer <API_KEY>'
      type: http
      scheme: bearer
      bearerFormat: API_KEY

````