Create Deployment

curl --request POST \
  --url https://api.fireworks.ai/v1/accounts/{account_id}/deployments \
  --header 'Authorization: Bearer <token>' \
  --header 'Content-Type: application/json' \
  --data '
{
  "baseModel": "<string>",
  "displayName": "<string>",
  "description": "<string>",
  "expireTime": "2023-11-07T05:31:56Z",
  "minReplicaCount": 123,
  "maxReplicaCount": 123,
  "autoscalingPolicy": {
    "scaleUpWindow": "<string>",
    "scaleDownWindow": "<string>",
    "scaleToZeroWindow": "<string>",
    "loadTargets": {}
  },
  "acceleratorCount": 123,
  "acceleratorType": "ACCELERATOR_TYPE_UNSPECIFIED",
  "precision": "PRECISION_UNSPECIFIED",
  "enableAddons": true,
  "draftTokenCount": 123,
  "draftModel": "<string>",
  "ngramSpeculationLength": 123,
  "enableSessionAffinity": true,
  "directRouteApiKeys": [
    "<string>"
  ],
  "directRouteType": "DIRECT_ROUTE_TYPE_UNSPECIFIED",
  "deploymentTemplate": "<string>",
  "autoTune": {
    "longPrompt": true
  },
  "placement": {
    "region": "REGION_UNSPECIFIED",
    "multiRegion": "MULTI_REGION_UNSPECIFIED",
    "regions": [
      "REGION_UNSPECIFIED"
    ]
  },
  "disableDeploymentSizeValidation": true,
  "enableMtp": true,
  "enableHotLoad": true,
  "hotLoadBucketType": "BUCKET_TYPE_UNSPECIFIED",
  "enableHotReloadLatestAddon": true,
  "deploymentShape": "<string>",
  "activeModelVersion": "<string>",
  "targetModelVersion": "<string>",
  "maxWithRevocableReplicaCount": 123
}
'

{
  "baseModel": "<string>",
  "name": "<string>",
  "displayName": "<string>",
  "description": "<string>",
  "createTime": "2023-11-07T05:31:56Z",
  "expireTime": "2023-11-07T05:31:56Z",
  "purgeTime": "2023-11-07T05:31:56Z",
  "deleteTime": "2023-11-07T05:31:56Z",
  "state": "STATE_UNSPECIFIED",
  "status": {
    "code": "OK",
    "message": "<string>"
  },
  "minReplicaCount": 123,
  "maxReplicaCount": 123,
  "desiredReplicaCount": 123,
  "replicaCount": 123,
  "autoscalingPolicy": {
    "scaleUpWindow": "<string>",
    "scaleDownWindow": "<string>",
    "scaleToZeroWindow": "<string>",
    "loadTargets": {}
  },
  "acceleratorCount": 123,
  "acceleratorType": "ACCELERATOR_TYPE_UNSPECIFIED",
  "precision": "PRECISION_UNSPECIFIED",
  "cluster": "<string>",
  "enableAddons": true,
  "draftTokenCount": 123,
  "draftModel": "<string>",
  "ngramSpeculationLength": 123,
  "enableSessionAffinity": true,
  "directRouteApiKeys": [
    "<string>"
  ],
  "numPeftDeviceCached": 123,
  "directRouteType": "DIRECT_ROUTE_TYPE_UNSPECIFIED",
  "directRouteHandle": "<string>",
  "deploymentTemplate": "<string>",
  "autoTune": {
    "longPrompt": true
  },
  "placement": {
    "region": "REGION_UNSPECIFIED",
    "multiRegion": "MULTI_REGION_UNSPECIFIED",
    "regions": [
      "REGION_UNSPECIFIED"
    ]
  },
  "region": "REGION_UNSPECIFIED",
  "updateTime": "2023-11-07T05:31:56Z",
  "disableDeploymentSizeValidation": true,
  "enableMtp": true,
  "enableHotLoad": true,
  "hotLoadBucketType": "BUCKET_TYPE_UNSPECIFIED",
  "enableHotReloadLatestAddon": true,
  "deploymentShape": "<string>",
  "activeModelVersion": "<string>",
  "targetModelVersion": "<string>",
  "replicaStats": {
    "pendingSchedulingReplicaCount": 123,
    "downloadingModelReplicaCount": 123,
    "initializingReplicaCount": 123,
    "readyReplicaCount": 123
  },
  "maxWithRevocableReplicaCount": 123
}

POST

accounts

{account_id}

deployments

Create Deployment

curl --request POST \
  --url https://api.fireworks.ai/v1/accounts/{account_id}/deployments \
  --header 'Authorization: Bearer <token>' \
  --header 'Content-Type: application/json' \
  --data '
{
  "baseModel": "<string>",
  "displayName": "<string>",
  "description": "<string>",
  "expireTime": "2023-11-07T05:31:56Z",
  "minReplicaCount": 123,
  "maxReplicaCount": 123,
  "autoscalingPolicy": {
    "scaleUpWindow": "<string>",
    "scaleDownWindow": "<string>",
    "scaleToZeroWindow": "<string>",
    "loadTargets": {}
  },
  "acceleratorCount": 123,
  "acceleratorType": "ACCELERATOR_TYPE_UNSPECIFIED",
  "precision": "PRECISION_UNSPECIFIED",
  "enableAddons": true,
  "draftTokenCount": 123,
  "draftModel": "<string>",
  "ngramSpeculationLength": 123,
  "enableSessionAffinity": true,
  "directRouteApiKeys": [
    "<string>"
  ],
  "directRouteType": "DIRECT_ROUTE_TYPE_UNSPECIFIED",
  "deploymentTemplate": "<string>",
  "autoTune": {
    "longPrompt": true
  },
  "placement": {
    "region": "REGION_UNSPECIFIED",
    "multiRegion": "MULTI_REGION_UNSPECIFIED",
    "regions": [
      "REGION_UNSPECIFIED"
    ]
  },
  "disableDeploymentSizeValidation": true,
  "enableMtp": true,
  "enableHotLoad": true,
  "hotLoadBucketType": "BUCKET_TYPE_UNSPECIFIED",
  "enableHotReloadLatestAddon": true,
  "deploymentShape": "<string>",
  "activeModelVersion": "<string>",
  "targetModelVersion": "<string>",
  "maxWithRevocableReplicaCount": 123
}
'

{
  "baseModel": "<string>",
  "name": "<string>",
  "displayName": "<string>",
  "description": "<string>",
  "createTime": "2023-11-07T05:31:56Z",
  "expireTime": "2023-11-07T05:31:56Z",
  "purgeTime": "2023-11-07T05:31:56Z",
  "deleteTime": "2023-11-07T05:31:56Z",
  "state": "STATE_UNSPECIFIED",
  "status": {
    "code": "OK",
    "message": "<string>"
  },
  "minReplicaCount": 123,
  "maxReplicaCount": 123,
  "desiredReplicaCount": 123,
  "replicaCount": 123,
  "autoscalingPolicy": {
    "scaleUpWindow": "<string>",
    "scaleDownWindow": "<string>",
    "scaleToZeroWindow": "<string>",
    "loadTargets": {}
  },
  "acceleratorCount": 123,
  "acceleratorType": "ACCELERATOR_TYPE_UNSPECIFIED",
  "precision": "PRECISION_UNSPECIFIED",
  "cluster": "<string>",
  "enableAddons": true,
  "draftTokenCount": 123,
  "draftModel": "<string>",
  "ngramSpeculationLength": 123,
  "enableSessionAffinity": true,
  "directRouteApiKeys": [
    "<string>"
  ],
  "numPeftDeviceCached": 123,
  "directRouteType": "DIRECT_ROUTE_TYPE_UNSPECIFIED",
  "directRouteHandle": "<string>",
  "deploymentTemplate": "<string>",
  "autoTune": {
    "longPrompt": true
  },
  "placement": {
    "region": "REGION_UNSPECIFIED",
    "multiRegion": "MULTI_REGION_UNSPECIFIED",
    "regions": [
      "REGION_UNSPECIFIED"
    ]
  },
  "region": "REGION_UNSPECIFIED",
  "updateTime": "2023-11-07T05:31:56Z",
  "disableDeploymentSizeValidation": true,
  "enableMtp": true,
  "enableHotLoad": true,
  "hotLoadBucketType": "BUCKET_TYPE_UNSPECIFIED",
  "enableHotReloadLatestAddon": true,
  "deploymentShape": "<string>",
  "activeModelVersion": "<string>",
  "targetModelVersion": "<string>",
  "replicaStats": {
    "pendingSchedulingReplicaCount": 123,
    "downloadingModelReplicaCount": 123,
    "initializingReplicaCount": 123,
    "readyReplicaCount": 123
  },
  "maxWithRevocableReplicaCount": 123
}

Authorizations

Authorization

string

header

required

Bearer authentication using your Fireworks API key. Format: Bearer <API_KEY>

Path Parameters

account_id

string

required

The Account Id

Query Parameters

disableAutoDeploy

boolean

By default, when a deployment is created with a base model that is currently undeployed, that base model is automatically deployed to this deployment. If true, this auto-deploy behavior is disabled.

disableSpeculativeDecoding

boolean

By default, a deployment will use the speculative decoding settings from the base model. If true, this will disable speculative decoding.

deploymentId

string

The ID of the deployment. If not specified, a random ID will be generated.

validateOnly

boolean

If true, this will not create the deployment, but will return the deployment that would be created.

skipShapeValidation

boolean

By default, a deployment will ensure the deployment shape provided is validated. If true, we will not require the deployment shape to be validated.

Body

application/json

The properties of the deployment being created.

baseModel

string

required

displayName

string

Human-readable display name of the deployment, e.g. "My Deployment". Must be fewer than 64 characters long.

description

string

Description of the deployment.

expireTime

string<date-time>

The time at which this deployment will automatically be deleted.

minReplicaCount

integer<int32>

The minimum number of replicas. If not specified, the default is 0.

maxReplicaCount

integer<int32>

The maximum number of replicas. If not specified, the default is max(min_replica_count, 1). May be set to 0 to downscale the deployment to 0.

autoscalingPolicy

object

Show child attributes

acceleratorCount

integer<int32>

The number of accelerators used per replica. If not specified, the default is the estimated minimum required by the base model.

acceleratorType

enum<string>

default:ACCELERATOR_TYPE_UNSPECIFIED

The type of accelerator to use.

Available options:

ACCELERATOR_TYPE_UNSPECIFIED,

NVIDIA_A100_80GB,

NVIDIA_H100_80GB,

AMD_MI300X_192GB,

NVIDIA_A10G_24GB,

NVIDIA_A100_40GB,

NVIDIA_L4_24GB,

NVIDIA_H200_141GB,

NVIDIA_B200_180GB,

AMD_MI325X_256GB,

AMD_MI350X_288GB

precision

enum<string>

default:PRECISION_UNSPECIFIED

The precision with which the model should be served.

Available options:

PRECISION_UNSPECIFIED,

FP16,

FP8,

FP8_MM,

FP8_AR,

FP8_MM_KV_ATTN,

FP8_KV,

FP8_MM_V2,

FP8_V2,

FP8_MM_KV_ATTN_V2,

NF4,

FP4,

BF16,

FP4_BLOCKSCALED_MM,

FP4_MX_MOE

enableAddons

boolean

If true, PEFT addons are enabled for this deployment.

draftTokenCount

integer<int32>

The number of candidate tokens to generate per step for speculative decoding. Default is the base model's draft_token_count. Set CreateDeploymentRequest.disable_speculative_decoding (the disableSpeculativeDecoding query parameter) to true to disable this behavior.

draftModel

string

The draft model name for speculative decoding, e.g. accounts/fireworks/models/my-draft-model. If empty, speculative decoding using a draft model is disabled. Default is the base model's default_draft_model. Set CreateDeploymentRequest.disable_speculative_decoding (the disableSpeculativeDecoding query parameter) to true to disable this behavior.

ngramSpeculationLength

integer<int32>

The length of previous input sequence to be considered for N-gram speculation.

enableSessionAffinity

boolean

Whether to apply sticky routing based on the user field. For serverless deployments, this is set to true when the deployment is created.

directRouteApiKeys

string[]

The set of API keys used to access the direct route deployment. If direct routing is not enabled, this field is unused.

directRouteType

enum<string>

default:DIRECT_ROUTE_TYPE_UNSPECIFIED

If set, this deployment will expose an endpoint that bypasses the Fireworks API gateway.

Available options:

DIRECT_ROUTE_TYPE_UNSPECIFIED,

INTERNET,

GCP_PRIVATE_SERVICE_CONNECT,

AWS_PRIVATELINK

deploymentTemplate

string

The name of the deployment template to use for this deployment. Only available to enterprise accounts.

autoTune

object

The performance profile to use for this deployment.

Show child attributes

placement

object

The desired geographic region where the deployment must be placed. If unspecified, the default is the GLOBAL multi-region.

Show child attributes

disableDeploymentSizeValidation

boolean

Whether the deployment size validation is disabled.

enableMtp

boolean

If true, MTP is enabled for this deployment.

enableHotLoad

boolean

Whether to use hot load for this deployment.

hotLoadBucketType

enum<string>

default:BUCKET_TYPE_UNSPECIFIED

Available options:

BUCKET_TYPE_UNSPECIFIED,

MINIO,

S3,

NEBIUS

enableHotReloadLatestAddon

boolean

Allows up to 1 addon at a time to be loaded, and will merge it into the base model.

deploymentShape

string

The name of the deployment shape that this deployment is using. On the server side, this will be replaced with the deployment shape version name.

activeModelVersion

string

The model version that is currently active and applied to running replicas of a deployment.

targetModelVersion

string

The target model version that is being rolled out to the deployment. In a ready steady state, the target model version is the same as the active model version.

maxWithRevocableReplicaCount

integer<int32>

max_with_revocable_replica_count is max replica count including revocable capacity. The max revocable capacity will be max_with_revocable_replica_count - max_replica_count.

Response

200 - application/json

A successful response.

baseModel

string

required

name

string

displayName

string

Human-readable display name of the deployment, e.g. "My Deployment". Must be fewer than 64 characters long.

description

string

Description of the deployment.

createTime

string<date-time>

The creation time of the deployment.

expireTime

string<date-time>

The time at which this deployment will automatically be deleted.

purgeTime

string<date-time>

The time at which the resource will be hard deleted.

deleteTime

string<date-time>

The time at which the resource will be soft deleted.

state

enum<string>

default:STATE_UNSPECIFIED

The state of the deployment.

Available options:

STATE_UNSPECIFIED,

CREATING,

READY,

DELETING,

FAILED,

UPDATING,

DELETED

status

Mimics [https://github.com/googleapis/googleapis/blob/master/google/rpc/status.proto] · object

Detailed status information regarding the most recent operation.

Show child attributes

minReplicaCount

integer<int32>

The minimum number of replicas. If not specified, the default is 0.

maxReplicaCount

integer<int32>

The maximum number of replicas. If not specified, the default is max(min_replica_count, 1). May be set to 0 to downscale the deployment to 0.

desiredReplicaCount

integer<int32>

The desired number of replicas for this deployment. This represents the target replica count that the system is trying to achieve.

replicaCount

integer<int32>

autoscalingPolicy

object

Show child attributes

acceleratorCount

integer<int32>

The number of accelerators used per replica. If not specified, the default is the estimated minimum required by the base model.

acceleratorType

enum<string>

default:ACCELERATOR_TYPE_UNSPECIFIED

The type of accelerator to use.

Available options:

ACCELERATOR_TYPE_UNSPECIFIED,

NVIDIA_A100_80GB,

NVIDIA_H100_80GB,

AMD_MI300X_192GB,

NVIDIA_A10G_24GB,

NVIDIA_A100_40GB,

NVIDIA_L4_24GB,

NVIDIA_H200_141GB,

NVIDIA_B200_180GB,

AMD_MI325X_256GB,

AMD_MI350X_288GB

precision

enum<string>

default:PRECISION_UNSPECIFIED

The precision with which the model should be served.

Available options:

PRECISION_UNSPECIFIED,

FP16,

FP8,

FP8_MM,

FP8_AR,

FP8_MM_KV_ATTN,

FP8_KV,

FP8_MM_V2,

FP8_V2,

FP8_MM_KV_ATTN_V2,

NF4,

FP4,

BF16,

FP4_BLOCKSCALED_MM,

FP4_MX_MOE

cluster

string

If set, this deployment is deployed to a cloud-premise cluster.

enableAddons

boolean

If true, PEFT addons are enabled for this deployment.

draftTokenCount

integer<int32>

draftModel

string

ngramSpeculationLength

integer<int32>

The length of previous input sequence to be considered for N-gram speculation.

enableSessionAffinity

boolean

Whether to apply sticky routing based on the user field. For serverless deployments, this is set to true when the deployment is created.

directRouteApiKeys

string[]

The set of API keys used to access the direct route deployment. If direct routing is not enabled, this field is unused.

numPeftDeviceCached

integer<int32>

directRouteType

enum<string>

default:DIRECT_ROUTE_TYPE_UNSPECIFIED

If set, this deployment will expose an endpoint that bypasses the Fireworks API gateway.

Available options:

DIRECT_ROUTE_TYPE_UNSPECIFIED,

INTERNET,

GCP_PRIVATE_SERVICE_CONNECT,

AWS_PRIVATELINK

directRouteHandle

string

The handle for calling a direct route. The meaning of the handle depends on the direct route type of the deployment: INTERNET -> the host name for accessing the deployment; GCP_PRIVATE_SERVICE_CONNECT -> the service attachment name used to create the PSC endpoint; AWS_PRIVATELINK -> the service name used to create the VPC endpoint.

deploymentTemplate

string

The name of the deployment template to use for this deployment. Only available to enterprise accounts.

autoTune

object

The performance profile to use for this deployment.

Show child attributes

placement

object

The desired geographic region where the deployment must be placed. If unspecified, the default is the GLOBAL multi-region.

Show child attributes

region

enum<string>

default:REGION_UNSPECIFIED

The geographic region where the deployment is presently located. This region may change over time, but within the placement constraint.

Available options:

REGION_UNSPECIFIED,

US_IOWA_1,

US_VIRGINIA_1,

US_VIRGINIA_2,

US_ILLINOIS_1,

AP_TOKYO_1,

EU_LONDON_1,

US_ARIZONA_1,

US_TEXAS_1,

US_ILLINOIS_2,

EU_FRANKFURT_1,

US_TEXAS_2,

EU_PARIS_1,

EU_HELSINKI_1,

US_NEVADA_1,

EU_ICELAND_1,

EU_ICELAND_2,

US_WASHINGTON_1,

US_WASHINGTON_2,

EU_ICELAND_DEV_1,

US_WASHINGTON_3,

US_ARIZONA_2,

AP_TOKYO_2,

US_CALIFORNIA_1,

US_MISSOURI_1,

US_UTAH_1,

US_TEXAS_3,

US_ARIZONA_3,

US_GEORGIA_1,

US_GEORGIA_2,

US_WASHINGTON_4,

US_GEORGIA_3,

NA_BRITISHCOLUMBIA_1,

US_GEORGIA_4

updateTime

string<date-time>

The update time for the deployment.

disableDeploymentSizeValidation

boolean

Whether the deployment size validation is disabled.

enableMtp

boolean

If true, MTP is enabled for this deployment.

enableHotLoad

boolean

Whether to use hot load for this deployment.

hotLoadBucketType

enum<string>

default:BUCKET_TYPE_UNSPECIFIED

Available options:

BUCKET_TYPE_UNSPECIFIED,

MINIO,

S3,

NEBIUS

enableHotReloadLatestAddon

boolean

Allows up to 1 addon at a time to be loaded, and will merge it into the base model.

deploymentShape

string

The name of the deployment shape that this deployment is using. On the server side, this will be replaced with the deployment shape version name.

activeModelVersion

string

The model version that is currently active and applied to running replicas of a deployment.

targetModelVersion

string

The target model version that is being rolled out to the deployment. In a ready steady state, the target model version is the same as the active model version.

replicaStats

object

Per-replica deployment status counters. Provides visibility into the deployment process by tracking replicas in different stages of the deployment lifecycle.

Show child attributes

maxWithRevocableReplicaCount

integer<int32>

max_with_revocable_replica_count is max replica count including revocable capacity. The max revocable capacity will be max_with_revocable_replica_count - max_replica_count.

Create embeddings

List Deployments

⌘I

API Reference

Inference

Deployments

Fine-tuning

Evals

Multimedia

Admin

Build SDK (Deprecated)

Create Deployment

Authorizations

Path Parameters

Query Parameters

Body

Response