Inferless
Inferless Inference Endpoints API

Each deployed model exposes an auto-generated REST inference endpoint on a per-deployment host (m-<id>.<region>.model-v1.inferless.com) that accepts a KServe v2 style inputs[] payload with name, shape, datatype, and data. Requests are secured with a workspace API key passed as a Bearer token, and usage is billed per second of GPU compute.
Documentation GitHub OpenAPI
OpenAPI Specification

# OpenAPI 3.0 description of the Inferless inference and management APIs.
openapi: 3.0.1
info:
  title: Inferless API
  version: '1.0'
  # Folded scalar (>-): the wrapped lines are joined with single spaces and the
  # trailing newline is stripped, so this is one paragraph at parse time.
  description: >-
    Specification of the Inferless serverless GPU inference platform. Covers
    the per-deployment model inference endpoint (KServe v2 style inputs[]
    payload) and the workspace-scoped REST management API for model settings
    and logs. The inference host is auto-generated per deployment
    (m-<id>.<region>.model-v1.inferless.com); management APIs are served from
    https://api.inferless.com. All requests authenticate with a workspace API
    key passed as a Bearer token in the Authorization header.
  termsOfService: https://www.inferless.com/terms
  contact:
    name: Inferless Support
    url: https://www.inferless.com
# Root-level servers apply to every operation that does not declare its own
# `servers` override -- here, the /rest/model/... management operations. Only
# the management base URL belongs in this list.
#
# The per-deployment inference host
# (https://{modelId}.{region}.model-v1.inferless.com) is declared on the
# runInference operation itself. Listing it at the root as well would
# advertise the management paths on a host that does not serve them.
# For reference: the full inference URL is auto-generated and shown on the
# model's API page in the Inferless console; `modelId` is the deployed model
# identifier prefix (e.g. m-xxxxxxxx) and `region` is the deployment region
# segment.
servers:
  - url: https://api.inferless.com
    description: Inferless management API base URL.
# Default security requirement: every operation requires the `bearerAuth`
# scheme (defined under components.securitySchemes) unless the operation
# declares its own `security` list. The empty list means no scopes apply.
security:
  - bearerAuth: []
paths:
  # Inference endpoint. Unlike the management paths, this operation is served
  # from the per-deployment inference host, so it overrides `servers` below.
  /v2/inference/{model_name}/infer:
    post:
      operationId: runInference
      tags:
        - Inference
      summary: Run inference against a deployed model.
      description: >-
        Sends an inference request to a deployed model's auto-generated endpoint.
        The request body uses the KServe v2 inputs[] structure (name, shape,
        datatype, data). This path is served from the per-deployment inference
        host, not from api.inferless.com.
      # Operation-level override of the root `servers` list. The server and
      # variable descriptions are spelled out here so the override is
      # self-documenting in generated docs and clients.
      servers:
        - url: https://{modelId}.{region}.model-v1.inferless.com
          description: >-
            Per-deployment inference host. The full URL is auto-generated and
            shown on the model's API page in the Inferless console.
          variables:
            modelId:
              default: m-xxxxxxxx
              description: The deployed model identifier prefix.
            region:
              default: default
              description: The deployment region segment.
      parameters:
        - name: model_name
          in: path
          required: true
          schema:
            type: string
          description: The model name segment of the auto-generated inference URL.
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/InferenceRequest'
            # Single one-element BYTES tensor carrying a text prompt.
            example:
              inputs:
                - name: prompt
                  shape: [1]
                  datatype: BYTES
                  data: ["What is AI?"]
      responses:
        '200':
          description: Inference result.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/InferenceResponse'
        '401':
          description: Missing or invalid API key.
  # Management operation: no operation-level `servers` override, so the
  # root-level `servers` list applies.
  /rest/model/settings/update/:
    post:
      tags:
        - Model Management
      operationId: updateModelSettings
      summary: Update a model's autoscaling and machine settings.
      description: >-
        Updates a deployed model's configuration including min/max replicas,
        scale-down delay, inference timeout, dedicated vs shared, machine
        type, and container concurrency.
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ModelSettingsUpdateRequest'
            # `<model-id>` is a placeholder; quoted so it is unambiguously a
            # string.
            example:
              model_id: '<model-id>'
              data:
                min_replica: 0
                max_replica: 2
                scale_down_delay: 30
                inference_time: 120
                is_dedicated: false
                machine_type: T4
                container_concurrency: 10
                is_input_output_enabled: false
      responses:
        '200':
          description: Update result.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/StatusDetailResponse'
              example:
                status: success
                details: Model updated successfully
        '401':
          description: Missing or invalid workspace API token.
  # Management operation: no operation-level `servers` override, so the
  # root-level `servers` list applies. Note this is a POST with a JSON body,
  # not a GET with query parameters.
  /rest/model/logs/get/:
    post:
      tags:
        - Model Management
      operationId: getModelLogs
      summary: Retrieve runtime logs for a deployed model.
      description: >-
        Returns stdout/stderr log entries for a deployed model over a time
        range, with optional pagination via next_token.
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ModelLogsGetRequest'
            # Timestamps are quoted so YAML loaders keep them as strings
            # instead of converting them to native date objects.
            example:
              model_id: '<model-id>'
              time_from: '2026-06-19T00:00:00Z'
              time_to: '2026-06-20T00:00:00Z'
              is_less_logs: false
      responses:
        '200':
          description: Model logs.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ModelLogsResponse'
        '401':
          description: Missing or invalid workspace API token.
components:
  # Reusable authentication schemes; `bearerAuth` is the one referenced by the
  # root-level `security` requirement.
  securitySchemes:
    bearerAuth:
      type: http
      scheme: bearer
      description: >-
        Workspace API key passed as a Bearer token in the Authorization
        header.
  schemas:
    # Request body of the runInference operation: a list of input tensors.
    InferenceRequest:
      type: object
      properties:
        inputs:
          type: array
          items:
            $ref: '#/components/schemas/InferenceTensor'
      required:
        - inputs
    # One named tensor in the KServe v2 style payload. Referenced by
    # InferenceRequest.inputs and by InferenceResponse.outputs.
    InferenceTensor:
      type: object
      properties:
        name:
          type: string
          description: Input parameter name as defined in the model input schema.
          example: prompt
        shape:
          type: array
          items:
            type: integer
          description: Tensor dimensions; use [-1] for variable-length inputs.
          example:
            - 1
        datatype:
          type: string
          description: Data type of the input.
          enum: [BYTES, STRING, INT8, INT32, FP32]
          example: BYTES
        data:
          type: array
          # Empty schema: items of any JSON type are accepted.
          items: {}
          description: The input values matching the declared shape and datatype.
          example:
            - 'What is AI?'
      required:
        - name
        - shape
        - datatype
        - data
    # Response body of the runInference operation. Only `outputs` is declared;
    # no additionalProperties restriction is set, so other keys are permitted.
    # NOTE(review): `outputs` items reuse InferenceTensor, whose field
    # descriptions are worded for inputs -- confirm that is intended.
    InferenceResponse:
      type: object
      description: >-
        Model output. Inferless returns the dictionary produced by the
        model's inference function, optionally wrapped in an outputs[] array.
      properties:
        outputs:
          type: array
          items:
            $ref: '#/components/schemas/InferenceTensor'
    # Request body of the updateModelSettings operation: the target model plus
    # a `data` object holding the settings to apply.
    # NOTE(review): units for scale_down_delay and inference_time are not
    # stated in this document -- confirm before documenting them.
    ModelSettingsUpdateRequest:
      type: object
      properties:
        model_id:
          type: string
        data:
          type: object
          # No `required` list: none of the settings below is marked mandatory.
          properties:
            min_replica:
              type: integer
            max_replica:
              type: integer
            scale_down_delay:
              type: integer
            inference_time:
              type: integer
            is_dedicated:
              type: boolean
            machine_type:
              type: string
              example: T4
            container_concurrency:
              type: integer
            is_input_output_enabled:
              type: boolean
      required:
        - model_id
        - data
    # Request body of the getModelLogs operation: a model identifier plus the
    # time range to fetch logs for, with an optional pagination token.
    ModelLogsGetRequest:
      type: object
      required:
        - model_id
        - time_from
        - time_to
      properties:
        model_id:
          type: string
          description: Identifier of the deployed model whose logs are requested.
        # The range bounds are RFC 3339 timestamps, as in the operation's
        # request example; `format: date-time` makes that explicit to clients
        # and validators instead of leaving them as free-form strings.
        time_from:
          type: string
          format: date-time
          description: Start of the requested time range (RFC 3339 timestamp).
          example: '2026-06-19T00:00:00Z'
        time_to:
          type: string
          format: date-time
          description: End of the requested time range (RFC 3339 timestamp).
          example: '2026-06-20T00:00:00Z'
        # NOTE(review): the exact effect of is_less_logs is not described in
        # this document -- confirm against the Inferless API reference.
        is_less_logs:
          type: boolean
        next_token:
          type: string
          description: >-
            Optional pagination token. Presumably the next_token value
            returned by a previous logs response; omit it on the first
            request.
    # Response body of the getModelLogs operation: a status string, a list of
    # log entries, and a token used for pagination.
    ModelLogsResponse:
      type: object
      properties:
        status:
          type: string
          example: success
        details:
          type: array
          # Each entry is one log line with its timestamp and source stream.
          items:
            type: object
            properties:
              time:
                type: string
              log:
                type: string
              stream:
                type: string
                enum: [stderr, stdout]
        next_token:
          type: string
    # Generic status envelope; used as the 200 response of the
    # updateModelSettings operation.
    StatusDetailResponse:
      type: object
      properties:
        status:
          example: success
          type: string
        details:
          type: string
Inferless Inference Endpoints API

Documentation

Specifications

Other Resources

OpenAPI Specification