Predibase Deployments API

Create, read, update, and delete dedicated and private serverless deployments, selecting a base model and GPU accelerator (A10, A100) and enabling LoRA serving for fine-tuned adapters.

OpenAPI Specification

predibase-openapi.yml Raw ↑
# OpenAPI document version (quoted so it is always read as a string).
openapi: '3.0.1'

# API metadata: title, version, human-readable overview, and support contact.
info:
  title: Predibase API
  version: "2.0"
  description: >-
    Specification of the Predibase API surfaces documented at
    https://docs.predibase.com. Two planes are covered:
    (1) the inference data plane on https://serving.app.predibase.com,
    which exposes an OpenAI-compatible chat/completions interface plus native
    generate / generate_stream text-generation endpoints,
    scoped per tenant and deployment;
    and (2) the control plane on https://api.app.predibase.com,
    which manages fine-tuning jobs, adapter repositories, deployments,
    datasets, and base models.
    All endpoints authenticate with a Predibase API token
    sent as an HTTP Bearer token.
  termsOfService: 'https://predibase.com/terms-of-service'
  contact:
    name: Predibase Support
    url: 'https://docs.predibase.com'
    email: support@predibase.com
# Document-level server list: the inference base first, then the control plane.
servers:
  # Inference base; {tenant} and {model} are templated URL variables.
  - url: 'https://serving.app.predibase.com/{tenant}/deployments/v2/llms/{model}'
    description: >-
      Inference (serving) base.
      tenant is your Predibase tenant ID (Settings > My Profile);
      model is the deployment name (Deployments page).
      The OpenAI-compatible routes live under the /v1 suffix of this base.
    variables:
      tenant:
        description: Predibase tenant ID.
        default: TENANT_ID
      model:
        description: Deployment name (base model deployment).
        default: DEPLOYMENT_NAME
  # Control-plane base; no URL variables.
  - url: 'https://api.app.predibase.com/v2'
    description: >-
      Control plane base for fine-tuning, adapters, deployments, datasets,
      and models.
# Document-level security requirement: the bearerAuth scheme declared under
# components.securitySchemes, with an empty scope list.
security:
  - bearerAuth: []
paths:
  # Chat completion route; its server entry repeats the inference base URL.
  /v1/chat/completions:
    post:
      tags: [Inference]
      operationId: createChatCompletion
      summary: OpenAI-compatible chat completion.
      description: >-
        Creates a chat completion against the deployment named in the server URL.
        OpenAI Chat Completions v1 compatible.
        Set model to an adapter ID ("repo"/"version") to serve a fine-tuned LoRA
        on top of the base model, or leave it empty to use the base model.
        Set stream to true to receive Server-Sent Events.
      servers:
        - url: 'https://serving.app.predibase.com/{tenant}/deployments/v2/llms/{model}'
          variables:
            model:
              default: DEPLOYMENT_NAME
            tenant:
              default: TENANT_ID
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/ChatCompletionRequest"
      responses:
        '200':
          description: A chat completion (or an SSE stream when stream is true).
          content:
            # JSON body for the non-streaming case.
            application/json:
              schema:
                $ref: "#/components/schemas/ChatCompletionResponse"
            # Event stream for the streaming case.
            text/event-stream:
              schema:
                type: string
  # Text completion route on the inference base URL.
  # CompletionRequest accepts a `stream` flag, so the 200 response declares an
  # event-stream media type next to the JSON body, mirroring the chat route.
  /v1/completions:
    post:
      operationId: createCompletion
      tags:
        - Inference
      summary: OpenAI-compatible text completion.
      description: >-
        Creates a text completion against the deployment named in the server
        URL. OpenAI Completions v1 compatible. Set model to an adapter ID to
        serve a fine-tuned LoRA.
      servers:
        - url: https://serving.app.predibase.com/{tenant}/deployments/v2/llms/{model}
          variables:
            tenant:
              default: TENANT_ID
            model:
              default: DEPLOYMENT_NAME
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CompletionRequest'
      responses:
        '200':
          description: A text completion (or an SSE stream when stream is true).
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/CompletionResponse'
            # Returned instead of the JSON body when the request sets stream.
            text/event-stream:
              schema:
                type: string
  # Native (non-OpenAI) generation route on the inference base URL.
  /generate:
    post:
      tags: [Inference]
      operationId: generate
      summary: Native text generation.
      description: >-
        Generates text from a deployed model.
        The full URL is
        https://serving.app.predibase.com/{tenant}/deployments/v2/llms/{deployment}/generate.
        Optionally specify a LoRA adapter via parameters.adapter_id
        and parameters.adapter_source (pbase, hub, or s3).
      servers:
        - url: 'https://serving.app.predibase.com/{tenant}/deployments/v2/llms/{model}'
          variables:
            model:
              default: DEPLOYMENT_NAME
            tenant:
              default: TENANT_ID
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/GenerateRequest"
      responses:
        '200':
          description: Generated text.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/GenerateResponse"
  # Streaming variant of /generate; same request schema, event-stream response.
  /generate_stream:
    post:
      tags: [Inference]
      operationId: generateStream
      summary: Native streaming text generation.
      description: >-
        Streams generated tokens from a deployed model as Server-Sent Events.
        The full URL is
        https://serving.app.predibase.com/{tenant}/deployments/v2/llms/{deployment}/generate_stream.
      servers:
        - url: 'https://serving.app.predibase.com/{tenant}/deployments/v2/llms/{model}'
          variables:
            model:
              default: DEPLOYMENT_NAME
            tenant:
              default: TENANT_ID
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/GenerateRequest"
      responses:
        '200':
          description: An SSE stream of generated token events.
          content:
            text/event-stream:
              schema:
                type: string
  # Fine-tuning job collection: create (POST) and list (GET).
  /finetuning/jobs:
    # Pin this path to the control-plane base. Without a path-level override
    # the operations inherit the document-level server list, whose first entry
    # is the inference base URL.
    servers:
      - url: https://api.app.predibase.com/v2
        description: Control plane base.
    post:
      operationId: createFinetuningJob
      tags:
        - Fine-Tuning
      summary: Create a fine-tuning job.
      description: >-
        Starts a supervised or reinforcement (GRPO) fine-tuning job that trains
        a LoRA / Turbo LoRA adapter on a base model using a connected dataset,
        producing a new adapter version in the target repository.
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/FinetuningJobRequest'
      responses:
        '200':
          description: The created fine-tuning job.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/FinetuningJob'
    get:
      operationId: listFinetuningJobs
      tags:
        - Fine-Tuning
      summary: List fine-tuning jobs.
      responses:
        '200':
          description: A list of fine-tuning jobs.
  # Single fine-tuning job, addressed by the jobId path parameter.
  /finetuning/jobs/{jobId}:
    # Pin this path to the control-plane base. Without a path-level override
    # the operation inherits the document-level server list, whose first entry
    # is the inference base URL.
    servers:
      - url: https://api.app.predibase.com/v2
        description: Control plane base.
    get:
      operationId: getFinetuningJob
      tags:
        - Fine-Tuning
      summary: Get a fine-tuning job.
      parameters:
        - name: jobId
          in: path
          required: true
          schema:
            type: string
      responses:
        '200':
          description: The fine-tuning job.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/FinetuningJob'
  # Cancel action for the fine-tuning job named by the jobId path parameter.
  /finetuning/jobs/{jobId}/cancel:
    # Pin this path to the control-plane base. Without a path-level override
    # the operation inherits the document-level server list, whose first entry
    # is the inference base URL.
    servers:
      - url: https://api.app.predibase.com/v2
        description: Control plane base.
    post:
      operationId: cancelFinetuningJob
      tags:
        - Fine-Tuning
      summary: Cancel a fine-tuning job.
      parameters:
        - name: jobId
          in: path
          required: true
          schema:
            type: string
      responses:
        '200':
          description: The cancelled fine-tuning job.
  # Adapter repository collection: create (POST) and list (GET).
  /repos:
    # Pin this path to the control-plane base. Without a path-level override
    # the operations inherit the document-level server list, whose first entry
    # is the inference base URL.
    servers:
      - url: https://api.app.predibase.com/v2
        description: Control plane base.
    post:
      operationId: createRepo
      tags:
        - Adapters
      summary: Create an adapter repository.
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/RepoRequest'
      responses:
        '200':
          description: The created adapter repository.
    get:
      operationId: listRepos
      tags:
        - Adapters
      summary: List adapter repositories.
      responses:
        '200':
          description: A list of adapter repositories.
  # Single adapter repository, addressed by the repoName path parameter.
  /repos/{repoName}:
    # Pin this path to the control-plane base. Without a path-level override
    # the operations inherit the document-level server list, whose first entry
    # is the inference base URL.
    servers:
      - url: https://api.app.predibase.com/v2
        description: Control plane base.
    get:
      operationId: getRepo
      tags:
        - Adapters
      summary: Get an adapter repository.
      parameters:
        - name: repoName
          in: path
          required: true
          schema:
            type: string
      responses:
        '200':
          description: The adapter repository and its versions.
    delete:
      operationId: deleteRepo
      tags:
        - Adapters
      summary: Delete an adapter repository.
      parameters:
        - name: repoName
          in: path
          required: true
          schema:
            type: string
      responses:
        '200':
          description: The repository was deleted.
  # Deployment collection: create (POST) and list (GET).
  /deployments:
    # Pin this path to the control-plane base. Without a path-level override
    # the operations inherit the document-level server list, whose first entry
    # is the inference base URL.
    servers:
      - url: https://api.app.predibase.com/v2
        description: Control plane base.
    post:
      operationId: createDeployment
      tags:
        - Deployments
      summary: Create a dedicated deployment.
      description: >-
        Creates a dedicated or private serverless deployment of a base model on
        a selected GPU accelerator (e.g. a10_24gb, a100_80gb), with LoRA serving
        enabled for fine-tuned adapters.
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/DeploymentRequest'
      responses:
        '200':
          description: The created deployment.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Deployment'
    get:
      operationId: listDeployments
      tags:
        - Deployments
      summary: List deployments.
      responses:
        '200':
          description: A list of deployments.
  # Single deployment, addressed by the deploymentName path parameter.
  /deployments/{deploymentName}:
    # Pin this path to the control-plane base. Without a path-level override
    # the operations inherit the document-level server list, whose first entry
    # is the inference base URL.
    servers:
      - url: https://api.app.predibase.com/v2
        description: Control plane base.
    get:
      operationId: getDeployment
      tags:
        - Deployments
      summary: Get a deployment.
      parameters:
        - name: deploymentName
          in: path
          required: true
          schema:
            type: string
      responses:
        '200':
          description: The deployment.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Deployment'
    delete:
      operationId: deleteDeployment
      tags:
        - Deployments
      summary: Delete a deployment.
      parameters:
        - name: deploymentName
          in: path
          required: true
          schema:
            type: string
      responses:
        '200':
          description: The deployment was deleted.
  # Dataset collection: connect/upload (POST) and list (GET).
  /datasets:
    # Pin this path to the control-plane base. Without a path-level override
    # the operations inherit the document-level server list, whose first entry
    # is the inference base URL.
    servers:
      - url: https://api.app.predibase.com/v2
        description: Control plane base.
    post:
      operationId: createDataset
      tags:
        - Datasets
      summary: Connect or upload a dataset.
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/DatasetRequest'
      responses:
        '200':
          description: The created dataset.
    get:
      operationId: listDatasets
      tags:
        - Datasets
      summary: List datasets.
      responses:
        '200':
          description: A list of datasets.
  # Single dataset, addressed by the datasetName path parameter.
  /datasets/{datasetName}:
    # Pin this path to the control-plane base. Without a path-level override
    # the operation inherits the document-level server list, whose first entry
    # is the inference base URL.
    servers:
      - url: https://api.app.predibase.com/v2
        description: Control plane base.
    get:
      operationId: getDataset
      tags:
        - Datasets
      summary: Get a dataset.
      parameters:
        - name: datasetName
          in: path
          required: true
          schema:
            type: string
      responses:
        '200':
          description: The dataset.
  # Catalogue of supported base models (read-only).
  /models:
    # Pin this path to the control-plane base. Without a path-level override
    # the operation inherits the document-level server list, whose first entry
    # is the inference base URL.
    servers:
      - url: https://api.app.predibase.com/v2
        description: Control plane base.
    get:
      operationId: listModels
      tags:
        - Models
      summary: List supported base models.
      responses:
        '200':
          description: A list of supported base models.
  # Batch inference job collection: create (POST) and list (GET).
  /batch-inference/jobs:
    # Pin this path to the control-plane base. Without a path-level override
    # the operations inherit the document-level server list, whose first entry
    # is the inference base URL.
    servers:
      - url: https://api.app.predibase.com/v2
        description: Control plane base.
    post:
      operationId: createBatchInferenceJob
      tags:
        - Batch Inference
      summary: Create a batch inference job.
      description: >-
        Launches an asynchronous batch inference job against a base model with
        optional per-row adapter selection. Predibase deploys the target base
        model and loads any required adapters automatically.
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/BatchInferenceJobRequest'
      responses:
        '200':
          description: The created batch inference job.
    get:
      operationId: listBatchInferenceJobs
      tags:
        - Batch Inference
      summary: List batch inference jobs.
      responses:
        '200':
          description: A list of batch inference jobs.
  # Single batch inference job, addressed by the jobId path parameter.
  /batch-inference/jobs/{jobId}:
    # Pin this path to the control-plane base. Without a path-level override
    # the operation inherits the document-level server list, whose first entry
    # is the inference base URL.
    servers:
      - url: https://api.app.predibase.com/v2
        description: Control plane base.
    get:
      operationId: getBatchInferenceJob
      tags:
        - Batch Inference
      summary: Get a batch inference job.
      parameters:
        - name: jobId
          in: path
          required: true
          schema:
            type: string
      responses:
        '200':
          description: The batch inference job.
components:
  securitySchemes:
    # HTTP bearer scheme referenced by the document-level security requirement.
    bearerAuth:
      description: >-
        Predibase API token sent as Authorization: Bearer
        <PREDIBASE_API_TOKEN>. Generate a token from Settings in the
        Predibase console.
      type: http
      scheme: bearer
      bearerFormat: Predibase API token
  schemas:
    # Request body referenced by createChatCompletion; only messages is required.
    ChatCompletionRequest:
      type: object
      required: [messages]
      properties:
        model:
          type: string
          description: >-
            Adapter ID in the form "repo"/"version" to serve a fine-tuned
            LoRA, or empty to use the deployment's base model.
        messages:
          type: array
          items:
            $ref: "#/components/schemas/ChatMessage"
        max_tokens: {type: integer}
        temperature: {type: number}
        top_p: {type: number}
        stream:
          type: boolean
          description: >-
            When true, the response is delivered as Server-Sent Events.
    # One chat turn: a role from a fixed set plus its text content.
    ChatMessage:
      type: object
      required: [role, content]
      properties:
        role:
          type: string
          enum: [system, user, assistant]
        content: {type: string}
    # JSON body referenced by the createChatCompletion 200 response.
    ChatCompletionResponse:
      type: object
      properties:
        id: {type: string}
        object: {type: string}
        created: {type: integer}
        model: {type: string}
        # Each choice carries its index, the returned message, and a finish reason.
        choices:
          type: array
          items:
            type: object
            properties:
              index: {type: integer}
              message:
                $ref: "#/components/schemas/ChatMessage"
              finish_reason: {type: string}
        usage:
          $ref: "#/components/schemas/Usage"
    # Request body referenced by createCompletion; only prompt is required.
    CompletionRequest:
      type: object
      required: [prompt]
      properties:
        model:
          type: string
          description: >-
            Adapter ID ("repo"/"version") or empty for the base model.
        prompt: {type: string}
        max_tokens: {type: integer}
        temperature: {type: number}
        stream: {type: boolean}
    # JSON body referenced by the createCompletion 200 response.
    CompletionResponse:
      type: object
      properties:
        id: {type: string}
        object: {type: string}
        # Each choice carries the generated text, its index, and a finish reason.
        choices:
          type: array
          items:
            type: object
            properties:
              text: {type: string}
              index: {type: integer}
              finish_reason: {type: string}
        usage:
          $ref: "#/components/schemas/Usage"
    # Request body shared by the generate and generateStream operations.
    GenerateRequest:
      type: object
      required: [inputs]
      properties:
        inputs:
          type: string
          description: >-
            The prompt text. Include the fine-tuning prompt template when
            querying fine-tuned models.
        # Optional generation settings, including adapter selection.
        parameters:
          type: object
          properties:
            max_new_tokens: {type: integer}
            temperature: {type: number}
            adapter_id:
              type: string
              description: 'Adapter to apply, e.g. "my-repo/1".'
            adapter_source:
              type: string
              description: Where the adapter is loaded from.
              enum: [pbase, hub, s3]
    # JSON body referenced by the generate 200 response.
    GenerateResponse:
      type: object
      properties:
        generated_text: {type: string}
        # Free-form object; no inner properties are declared here.
        details: {type: object}
    # Token counters referenced by both completion response schemas.
    Usage:
      type: object
      properties:
        prompt_tokens: {type: integer}
        completion_tokens: {type: integer}
        total_tokens: {type: integer}
    # Request body referenced by createFinetuningJob.
    FinetuningJobRequest:
      type: object
      required: [base_model, dataset]
      properties:
        base_model: {type: string}
        dataset: {type: string}
        repo:
          type: string
          description: >-
            Target adapter repository for the resulting adapter version.
        adapter_type:
          type: string
          enum: [lora, turbo_lora]
        task:
          type: string
          description: >-
            Supervised fine-tuning (sft) or reinforcement fine-tuning (grpo).
          enum: [sft, grpo]
    # Fine-tuning job resource returned by the create and get operations.
    FinetuningJob:
      type: object
      properties:
        id: {type: string}
        status: {type: string}
        base_model: {type: string}
        repo: {type: string}
    # Request body referenced by createRepo; only name is required.
    RepoRequest:
      type: object
      required: [name]
      properties:
        name: {type: string}
        description: {type: string}
    # Request body referenced by createDeployment.
    DeploymentRequest:
      type: object
      required: [name, base_model]
      properties:
        name: {type: string}
        base_model: {type: string}
        accelerator:
          type: string
          description: >-
            GPU accelerator, e.g. a10_24gb or a100_80gb.
        min_replicas: {type: integer}
        max_replicas: {type: integer}
    # Deployment resource returned by the create and get operations.
    Deployment:
      type: object
      properties:
        name: {type: string}
        base_model: {type: string}
        accelerator: {type: string}
        status: {type: string}
    # Request body referenced by createDataset; only name is required.
    DatasetRequest:
      type: object
      required: [name]
      properties:
        name: {type: string}
        source: {type: string}
    # Request body referenced by createBatchInferenceJob.
    BatchInferenceJobRequest:
      type: object
      required: [base_model, dataset]
      properties:
        base_model: {type: string}
        dataset: {type: string}
        output: {type: string}