Prime Intellect

Prime Intellect Inference API

OpenAI-compatible inference API for hosted frontier and open models, served at api.pinference.ai. Supports streaming chat completions and the full set of OpenAI parameters (e.g., temperature, top_p, max_tokens, logprobs), and returns a `usage` object with input/output token counts and USD cost on every response. LoRA adapters can be served alongside base models via 1-click deployments.

Documentation GitHub OpenAPI

Documentation

📖

Documentation: Inference Overview

https://docs.primeintellect.ai/inference/overview

📖

Documentation: Inference Usage

https://docs.primeintellect.ai/inference/usage

📖

Documentation: Adapter Deployments

https://docs.primeintellect.ai/inference/adapter-deployments

Specifications

⚙

OpenAPI

https://raw.githubusercontent.com/api-evangelist/prime-intellect/refs/heads/main/openapi/prime-intellect-inference-api-openapi.yml

OpenAPI Specification

# OpenAPI document for the Prime Intellect Inference API.
openapi: 3.1.0
info:
  # Title and description describe the inference endpoints defined under
  # `paths` (/models and /chat/completions), not the GPU compute/pod API.
  title: Prime Intellect Inference API
  version: 0.1.0
  description: >-
    OpenAI-compatible inference API for hosted frontier and open models.
    Lists the available models and serves chat completions with support for
    streaming, temperature, top_p, max_tokens, and logprobs. Responses include
    a usage object with input/output token counts and cost.
  contact:
    name: Prime Intellect
    url: https://www.primeintellect.ai
# Base URL for the inference endpoints. The inference API is served from
# api.pinference.ai, not from the api.primeintellect.ai platform host.
# NOTE(review): the /api/v1 base path follows the public inference docs;
# confirm before publishing.
servers:
- url: https://api.pinference.ai/api/v1
# Every operation requires the bearer scheme declared under
# components.securitySchemes.
security:
- HTTPBearer: []
# Single tag used to group both operations in this document.
tags:
- name: Inference
paths:
  # Model catalogue: a single GET that returns a ModelList.
  /models:
    get:
      operationId: listInferenceModels
      summary: Prime Intellect List Inference Models
      description: >-
        List models available through the Prime Intellect inference API.
      tags: [Inference]
      responses:
        '200':
          description: OK
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ModelList'
  # Chat completions: a POST whose JSON body is a ChatCompletionRequest.
  /chat/completions:
    post:
      operationId: createChatCompletion
      summary: Prime Intellect Create Chat Completion
      description: >-
        OpenAI-compatible chat completions endpoint. Supports streaming,
        temperature, token limits, and logprobs. Responses include a usage
        object with input/output tokens and cost.
      tags: [Inference]
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ChatCompletionRequest'
      # NOTE(review): only the application/json response is declared here;
      # the shape of streamed responses is not modelled in this document.
      responses:
        '200':
          description: OK
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ChatCompletionResponse'
components:
  schemas:
    # Token accounting and cost, referenced by ChatCompletionResponse.usage.
    Usage:
      type: object
      properties:
        prompt_tokens: {type: integer}
        completion_tokens: {type: integer}
        total_tokens: {type: integer}
        # Declared as a plain number; no currency or unit is specified here.
        cost: {type: number}
    # One message in a conversation. Used for request messages and for the
    # message returned in each choice; both fields are required.
    ChatMessage:
      type: object
      required: [role, content]
      properties:
        role:
          type: string
          enum: [system, user, assistant, tool]
        content: {type: string}
    # Request body for POST /chat/completions. Only `model` and `messages`
    # are required; every other property is optional.
    ChatCompletionRequest:
      type: object
      required: [model, messages]
      properties:
        model: {type: string}
        messages:
          type: array
          items:
            $ref: '#/components/schemas/ChatMessage'
        # Streaming is opt-in: the declared default is false.
        stream:
          type: boolean
          default: false
        temperature: {type: number}
        top_p: {type: number}
        max_tokens: {type: integer}
        logprobs: {type: boolean}
    # Response body for GET /models: an envelope whose `data` array holds
    # Model entries.
    ModelList:
      type: object
      properties:
        object: {type: string, example: list}
        data:
          type: array
          items:
            $ref: '#/components/schemas/Model'
    # One entry of ChatCompletionResponse.choices.
    ChatCompletionChoice:
      type: object
      properties:
        index: {type: integer}
        message:
          $ref: '#/components/schemas/ChatMessage'
        # Free-form string; no enum of allowed values is declared.
        finish_reason: {type: string}
    # A single model entry, as listed in ModelList.data.
    Model:
      type: object
      properties:
        id: {type: string}
        object: {type: string, example: model}
        created: {type: integer}
        owned_by: {type: string}
    # Response body for POST /chat/completions (application/json).
    ChatCompletionResponse:
      type: object
      properties:
        id: {type: string}
        object: {type: string, example: chat.completion}
        created: {type: integer}
        model: {type: string}
        choices:
          type: array
          items:
            $ref: '#/components/schemas/ChatCompletionChoice'
        # Token counts and cost for this completion.
        usage:
          $ref: '#/components/schemas/Usage'
  # HTTP bearer-token authentication; referenced by name from the top-level
  # `security` requirement.
  securitySchemes:
    HTTPBearer:
      scheme: bearer
      type: http