# Hugging Face Inference Endpoints API
# Deploy and scale machine learning models with dedicated, secure infrastructure.
# OpenAPI document header: spec version, API metadata, server, default
# security requirement, and the tags used to group operations.
# NOTE(review): leading indentation is not visible in this view; the keys
# after `info:`, `contact:`, `license:` and the list items are presumably
# nested under them — confirm against the original file.
openapi: 3.1.0
# API title, long description, version, and legal/contact details.
info:
title: Hugging Face Inference Endpoints API
description: >-
Deploy and scale machine learning models with dedicated, secure
infrastructure. Manage Inference Endpoints programmatically - create,
update, scale, pause, resume, and delete dedicated endpoints for serving ML
models with autoscaling and custom hardware configurations.
version: 1.0.0
termsOfService: https://huggingface.co/terms-of-service
contact:
name: Hugging Face Support
url: https://huggingface.co/support
license:
name: Apache 2.0
url: https://www.apache.org/licenses/LICENSE-2.0
# Base URL that the paths below are relative to.
servers:
- url: https://api.endpoints.huggingface.cloud/v2
description: Hugging Face Inference Endpoints management API
# Document-level security requirement: uses the bearerAuth scheme declared
# under components.securitySchemes.
security:
- bearerAuth: []
# Tag names referenced by each operation's `tags` list.
tags:
- name: Endpoints
description: Manage dedicated inference endpoints
- name: Providers
description: Available cloud providers and hardware
paths:
/endpoint/{namespace}:
# GET: list every Inference Endpoint that belongs to the namespace in the path.
# NOTE(review): nesting was reconstructed from components.schemas.Endpoint
# because indentation is not visible in the reviewed copy — confirm.
get:
  summary: List All Endpoints
  description: >-
    List all Inference Endpoints for a given namespace (user or
    organization).
  operationId: listEndpoints
  tags:
    - Endpoints
  parameters:
    - name: namespace
      in: path
      required: true
      description: User or organization namespace
      schema:
        type: string
      example: my-organization
  responses:
    # The endpoints are wrapped in an object under the `items` key.
    '200':
      description: List of endpoints
      content:
        application/json:
          schema:
            type: object
            properties:
              items:
                type: array
                items:
                  $ref: '#/components/schemas/Endpoint'
          examples:
            Listendpoints200Example:
              summary: Default listEndpoints 200 response
              x-microcks-default: true
              # Values reuse the examples documented on the Endpoint schema
              # instead of generated placeholders.
              value:
                items:
                  - name: my-text-gen-endpoint
                    type: public
                    accountId: '500123'
                    provider:
                      vendor: aws
                      region: us-east-1
                    compute:
                      accelerator: gpu
                      instanceType: nvidia-a10g
                      instanceSize: x1
                      scaling: {}
                    model:
                      repository: meta-llama/Llama-2-7b-chat-hf
                      revision: main
                      task: text-generation
                      framework: pytorch
                      image: {}
                    status:
                      state: pending
                      message: example_value
                      createdAt: '2026-01-15T10:30:00Z'
                      updatedAt: '2026-01-15T10:30:00Z'
                      url: https://www.example.com
                    url: https://www.example.com
    '401':
      description: Unauthorized
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/Error'
          examples:
            Listendpoints401Example:
              summary: Default listEndpoints 401 response
              x-microcks-default: true
              # statusCode now matches the HTTP status of this response
              # (assumes Error.statusCode carries the HTTP status — confirm
              # against the Error schema).
              value:
                error: Unauthorized
                statusCode: 401
  # Vendor extension with mock settings (response delay and dispatcher).
  x-microcks-operation:
    delay: 0
    dispatcher: FALLBACK
# POST: create a new endpoint in the namespace given in the path.
post:
summary: Create a New Endpoint
description: >-
Create a new Inference Endpoint with the specified model, hardware, and
configuration.
operationId: createEndpoint
tags:
- Endpoints
parameters:
- name: namespace
in: path
required: true
description: User or organization namespace
schema:
type: string
example: example_value
# The JSON body is mandatory and is described by CreateEndpointRequest.
requestBody:
required: true
content:
application/json:
schema:
$ref: '#/components/schemas/CreateEndpointRequest'
examples:
CreateendpointRequestExample:
summary: Default createEndpoint request
x-microcks-default: true
# NOTE(review): indentation is not visible in this view, so it is unclear
# whether health_route and env sit under image.custom or under a parent
# object — confirm against the CreateEndpointRequest schema.
value:
name: Example Title
type: public
provider:
vendor: aws
region: example_value
compute:
accelerator: cpu
instanceType: example_value
instanceSize: example_value
scaling:
minReplica: 10
maxReplica: 10
scaleToZeroTimeout: 10
model:
repository: example_value
revision: example_value
task: example_value
framework: pytorch
image:
huggingface: example_value
custom:
url: https://www.example.com
health_route: example_value
env: example_value
# 201 returns the created Endpoint; 400, 401 and 409 return the Error schema.
responses:
'201':
description: Endpoint created successfully
content:
application/json:
schema:
$ref: '#/components/schemas/Endpoint'
examples:
Createendpoint201Example:
summary: Default createEndpoint 201 response
x-microcks-default: true
value:
name: Example Title
type: public
accountId: '500123'
provider:
vendor: aws
region: example_value
compute:
accelerator: example_value
instanceType: example_value
instanceSize: example_value
scaling:
minReplica: 10
maxReplica: 10
scaleToZeroTimeout: 10
model:
repository: example_value
revision: example_value
task: example_value
framework: pytorch
image:
# NOTE(review): components.schemas.Endpoint declares image.huggingface as an
# object, but this example supplies a string.
huggingface: example_value
custom:
url: https://www.example.com
port: 10
status:
state: pending
message: example_value
createdAt: '2026-01-15T10:30:00Z'
updatedAt: '2026-01-15T10:30:00Z'
url: https://www.example.com
url: https://www.example.com
'400':
description: Bad request - invalid configuration
content:
application/json:
schema:
$ref: '#/components/schemas/Error'
examples:
Createendpoint400Example:
summary: Default createEndpoint 400 response
x-microcks-default: true
value:
error: example_value
# NOTE(review): 10 does not match the 400 status of this response; the same
# value is used in the 401 and 409 examples below — presumably a generated
# placeholder.
statusCode: 10
'401':
description: Unauthorized
content:
application/json:
schema:
$ref: '#/components/schemas/Error'
examples:
Createendpoint401Example:
summary: Default createEndpoint 401 response
x-microcks-default: true
value:
error: example_value
statusCode: 10
'409':
description: Endpoint name already exists
content:
application/json:
schema:
$ref: '#/components/schemas/Error'
examples:
Createendpoint409Example:
summary: Default createEndpoint 409 response
x-microcks-default: true
value:
error: example_value
statusCode: 10
# Vendor extension with mock settings (response delay and dispatcher).
x-microcks-operation:
delay: 0
dispatcher: FALLBACK
# Read, update, and delete operations for one endpoint, addressed by
# {namespace} and {endpoint_name}.
# NOTE(review): nesting was reconstructed from components.schemas.Endpoint
# because indentation is not visible in the reviewed copy — confirm.
/endpoint/{namespace}/{endpoint_name}:
  get:
    summary: Get Endpoint Details
    description: Get detailed information about a specific Inference Endpoint.
    operationId: getEndpoint
    tags:
      - Endpoints
    parameters:
      - name: namespace
        in: path
        required: true
        description: User or organization namespace
        schema:
          type: string
        example: my-organization
      - name: endpoint_name
        in: path
        required: true
        description: Name of the endpoint
        schema:
          type: string
        example: my-text-gen-endpoint
    responses:
      '200':
        description: Endpoint details
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Endpoint'
            examples:
              Getendpoint200Example:
                summary: Default getEndpoint 200 response
                x-microcks-default: true
                # Values reuse the examples documented on the Endpoint schema.
                value:
                  name: my-text-gen-endpoint
                  type: public
                  accountId: '500123'
                  provider:
                    vendor: aws
                    region: us-east-1
                  compute:
                    accelerator: gpu
                    instanceType: nvidia-a10g
                    instanceSize: x1
                    scaling:
                      minReplica: 0
                      maxReplica: 2
                      scaleToZeroTimeout: 15
                  model:
                    repository: meta-llama/Llama-2-7b-chat-hf
                    revision: main
                    task: text-generation
                    framework: pytorch
                    image:
                      # Endpoint.image.huggingface is declared as an object.
                      huggingface: {}
                      custom:
                        url: https://www.example.com
                        port: 10
                  status:
                    state: pending
                    message: example_value
                    createdAt: '2026-01-15T10:30:00Z'
                    updatedAt: '2026-01-15T10:30:00Z'
                    url: https://www.example.com
                  url: https://www.example.com
      '404':
        description: Endpoint not found
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Error'
            examples:
              Getendpoint404Example:
                summary: Default getEndpoint 404 response
                x-microcks-default: true
                # statusCode matches the HTTP status of this response (assumes
                # Error.statusCode carries the HTTP status — confirm).
                value:
                  error: Endpoint not found
                  statusCode: 404
    # Vendor extension with mock settings (response delay and dispatcher).
    x-microcks-operation:
      delay: 0
      dispatcher: FALLBACK
  put:
    summary: Update an Endpoint
    description: >-
      Update the configuration of an existing Inference Endpoint including
      model, hardware, and scaling settings.
    operationId: updateEndpoint
    tags:
      - Endpoints
    parameters:
      - name: namespace
        in: path
        required: true
        description: User or organization namespace
        schema:
          type: string
        example: my-organization
      - name: endpoint_name
        in: path
        required: true
        description: Name of the endpoint
        schema:
          type: string
        example: my-text-gen-endpoint
    # The JSON body is mandatory and is described by UpdateEndpointRequest.
    requestBody:
      required: true
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/UpdateEndpointRequest'
          examples:
            UpdateendpointRequestExample:
              summary: Default updateEndpoint request
              x-microcks-default: true
              # NOTE(review): UpdateEndpointRequest is not visible in the
              # reviewed copy; scaling is assumed to nest under compute as it
              # does in the Endpoint schema — confirm.
              value:
                type: public
                compute:
                  accelerator: gpu
                  instanceType: nvidia-a10g
                  instanceSize: x1
                  scaling:
                    minReplica: 0
                    maxReplica: 2
                    scaleToZeroTimeout: 15
                model:
                  repository: meta-llama/Llama-2-7b-chat-hf
                  revision: main
                  task: text-generation
    responses:
      '200':
        description: Endpoint updated successfully
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Endpoint'
            examples:
              Updateendpoint200Example:
                summary: Default updateEndpoint 200 response
                x-microcks-default: true
                value:
                  name: my-text-gen-endpoint
                  type: public
                  accountId: '500123'
                  provider:
                    vendor: aws
                    region: us-east-1
                  compute:
                    accelerator: gpu
                    instanceType: nvidia-a10g
                    instanceSize: x1
                    scaling:
                      minReplica: 0
                      maxReplica: 2
                      scaleToZeroTimeout: 15
                  model:
                    repository: meta-llama/Llama-2-7b-chat-hf
                    revision: main
                    task: text-generation
                    framework: pytorch
                    image:
                      huggingface: {}
                      custom:
                        url: https://www.example.com
                        port: 10
                  status:
                    state: pending
                    message: example_value
                    createdAt: '2026-01-15T10:30:00Z'
                    updatedAt: '2026-01-15T10:30:00Z'
                    url: https://www.example.com
                  url: https://www.example.com
      '400':
        description: Bad request
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Error'
            examples:
              Updateendpoint400Example:
                summary: Default updateEndpoint 400 response
                x-microcks-default: true
                value:
                  error: Bad request
                  statusCode: 400
      '404':
        description: Endpoint not found
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Error'
            examples:
              Updateendpoint404Example:
                summary: Default updateEndpoint 404 response
                x-microcks-default: true
                value:
                  error: Endpoint not found
                  statusCode: 404
    x-microcks-operation:
      delay: 0
      dispatcher: FALLBACK
  delete:
    summary: Delete an Endpoint
    description: >-
      Permanently delete an Inference Endpoint and all associated resources.
    operationId: deleteEndpoint
    tags:
      - Endpoints
    parameters:
      - name: namespace
        in: path
        required: true
        description: User or organization namespace
        schema:
          type: string
        example: my-organization
      - name: endpoint_name
        in: path
        required: true
        description: Name of the endpoint
        schema:
          type: string
        example: my-text-gen-endpoint
    responses:
      # The success response documents no body.
      '200':
        description: Endpoint deleted successfully
      '404':
        description: Endpoint not found
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Error'
            examples:
              Deleteendpoint404Example:
                summary: Default deleteEndpoint 404 response
                x-microcks-default: true
                value:
                  error: Endpoint not found
                  statusCode: 404
    x-microcks-operation:
      delay: 0
      dispatcher: FALLBACK
# Pause action for one endpoint; returns the Endpoint after the call.
# NOTE(review): nesting was reconstructed from components.schemas.Endpoint
# because indentation is not visible in the reviewed copy — confirm.
/endpoint/{namespace}/{endpoint_name}/pause:
  post:
    summary: Pause an Endpoint
    description: >-
      Pause an Inference Endpoint to stop incurring compute costs while
      preserving the configuration. The endpoint can be resumed later.
    operationId: pauseEndpoint
    tags:
      - Endpoints
    parameters:
      - name: namespace
        in: path
        required: true
        description: User or organization namespace
        schema:
          type: string
        example: my-organization
      - name: endpoint_name
        in: path
        required: true
        description: Name of the endpoint
        schema:
          type: string
        example: my-text-gen-endpoint
    responses:
      '200':
        description: Endpoint paused successfully
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Endpoint'
            examples:
              Pauseendpoint200Example:
                summary: Default pauseEndpoint 200 response
                x-microcks-default: true
                value:
                  name: my-text-gen-endpoint
                  type: public
                  accountId: '500123'
                  provider:
                    vendor: aws
                    region: us-east-1
                  compute:
                    accelerator: gpu
                    instanceType: nvidia-a10g
                    instanceSize: x1
                    scaling:
                      minReplica: 0
                      maxReplica: 2
                      scaleToZeroTimeout: 15
                  model:
                    repository: meta-llama/Llama-2-7b-chat-hf
                    revision: main
                    task: text-generation
                    framework: pytorch
                    image:
                      # Endpoint.image.huggingface is declared as an object.
                      huggingface: {}
                      custom:
                        url: https://www.example.com
                        port: 10
                  status:
                    # A successful pause is illustrated with the `paused`
                    # value of the Endpoint.status.state enum.
                    state: paused
                    message: example_value
                    createdAt: '2026-01-15T10:30:00Z'
                    updatedAt: '2026-01-15T10:30:00Z'
                    url: https://www.example.com
                  url: https://www.example.com
      '404':
        description: Endpoint not found
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Error'
            examples:
              Pauseendpoint404Example:
                summary: Default pauseEndpoint 404 response
                x-microcks-default: true
                # statusCode matches the HTTP status of this response (assumes
                # Error.statusCode carries the HTTP status — confirm).
                value:
                  error: Endpoint not found
                  statusCode: 404
    # Vendor extension with mock settings (response delay and dispatcher).
    x-microcks-operation:
      delay: 0
      dispatcher: FALLBACK
# Resume action for a previously paused endpoint; returns the Endpoint.
# NOTE(review): nesting was reconstructed from components.schemas.Endpoint
# because indentation is not visible in the reviewed copy — confirm.
/endpoint/{namespace}/{endpoint_name}/resume:
  post:
    summary: Resume an Endpoint
    description: Resume a previously paused Inference Endpoint.
    operationId: resumeEndpoint
    tags:
      - Endpoints
    parameters:
      - name: namespace
        in: path
        required: true
        description: User or organization namespace
        schema:
          type: string
        example: my-organization
      - name: endpoint_name
        in: path
        required: true
        description: Name of the endpoint
        schema:
          type: string
        example: my-text-gen-endpoint
    responses:
      '200':
        description: Endpoint resumed successfully
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Endpoint'
            examples:
              Resumeendpoint200Example:
                summary: Default resumeEndpoint 200 response
                x-microcks-default: true
                # Values reuse the examples documented on the Endpoint schema.
                value:
                  name: my-text-gen-endpoint
                  type: public
                  accountId: '500123'
                  provider:
                    vendor: aws
                    region: us-east-1
                  compute:
                    accelerator: gpu
                    instanceType: nvidia-a10g
                    instanceSize: x1
                    scaling:
                      minReplica: 0
                      maxReplica: 2
                      scaleToZeroTimeout: 15
                  model:
                    repository: meta-llama/Llama-2-7b-chat-hf
                    revision: main
                    task: text-generation
                    framework: pytorch
                    image:
                      # Endpoint.image.huggingface is declared as an object.
                      huggingface: {}
                      custom:
                        url: https://www.example.com
                        port: 10
                  status:
                    state: pending
                    message: example_value
                    createdAt: '2026-01-15T10:30:00Z'
                    updatedAt: '2026-01-15T10:30:00Z'
                    url: https://www.example.com
                  url: https://www.example.com
      '404':
        description: Endpoint not found
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Error'
            examples:
              Resumeendpoint404Example:
                summary: Default resumeEndpoint 404 response
                x-microcks-default: true
                # statusCode matches the HTTP status of this response (assumes
                # Error.statusCode carries the HTTP status — confirm).
                value:
                  error: Endpoint not found
                  statusCode: 404
    # Vendor extension with mock settings (response delay and dispatcher).
    x-microcks-operation:
      delay: 0
      dispatcher: FALLBACK
# Scale-to-zero action for one endpoint; returns the Endpoint after the call.
# NOTE(review): nesting was reconstructed from components.schemas.Endpoint
# because indentation is not visible in the reviewed copy — confirm.
/endpoint/{namespace}/{endpoint_name}/scale-to-zero:
  post:
    summary: Scale Endpoint to Zero
    description: >-
      Scale the endpoint to zero replicas. The endpoint will automatically
      scale up when it receives traffic (if autoscaling is configured).
    operationId: scaleToZero
    tags:
      - Endpoints
    parameters:
      - name: namespace
        in: path
        required: true
        description: User or organization namespace
        schema:
          type: string
        example: my-organization
      - name: endpoint_name
        in: path
        required: true
        description: Name of the endpoint
        schema:
          type: string
        example: my-text-gen-endpoint
    responses:
      '200':
        description: Endpoint scaled to zero
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Endpoint'
            examples:
              Scaletozero200Example:
                summary: Default scaleToZero 200 response
                x-microcks-default: true
                value:
                  name: my-text-gen-endpoint
                  type: public
                  accountId: '500123'
                  provider:
                    vendor: aws
                    region: us-east-1
                  compute:
                    accelerator: gpu
                    instanceType: nvidia-a10g
                    instanceSize: x1
                    scaling:
                      # minReplica 0 is what allows the endpoint to sit at
                      # zero replicas in this example.
                      minReplica: 0
                      maxReplica: 2
                      scaleToZeroTimeout: 15
                  model:
                    repository: meta-llama/Llama-2-7b-chat-hf
                    revision: main
                    task: text-generation
                    framework: pytorch
                    image:
                      # Endpoint.image.huggingface is declared as an object.
                      huggingface: {}
                      custom:
                        url: https://www.example.com
                        port: 10
                  status:
                    # Illustrated with the `scaledToZero` value of the
                    # Endpoint.status.state enum.
                    state: scaledToZero
                    message: example_value
                    createdAt: '2026-01-15T10:30:00Z'
                    updatedAt: '2026-01-15T10:30:00Z'
                    url: https://www.example.com
                  url: https://www.example.com
      '404':
        description: Endpoint not found
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Error'
            examples:
              Scaletozero404Example:
                summary: Default scaleToZero 404 response
                x-microcks-default: true
                # statusCode matches the HTTP status of this response (assumes
                # Error.statusCode carries the HTTP status — confirm).
                value:
                  error: Endpoint not found
                  statusCode: 404
    # Vendor extension with mock settings (response delay and dispatcher).
    x-microcks-operation:
      delay: 0
      dispatcher: FALLBACK
# Runtime logs for one endpoint, returned as plain text rather than JSON.
/endpoint/{namespace}/{endpoint_name}/logs:
  get:
    summary: Get Endpoint Logs
    description: Retrieve the runtime logs for an Inference Endpoint.
    operationId: getEndpointLogs
    tags:
      - Endpoints
    parameters:
      - name: namespace
        in: path
        required: true
        description: User or organization namespace
        schema:
          type: string
        example: my-organization
      - name: endpoint_name
        in: path
        required: true
        description: Name of the endpoint
        schema:
          type: string
        example: my-text-gen-endpoint
    responses:
      '200':
        description: Endpoint logs
        content:
          text/plain:
            schema:
              type: string
            examples:
              Getendpointlogs200Example:
                summary: Default getEndpointLogs 200 response
                x-microcks-default: true
                value: example_value
      # Errors use the JSON Error schema even though success is text/plain.
      '404':
        description: Endpoint not found
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Error'
            examples:
              Getendpointlogs404Example:
                summary: Default getEndpointLogs 404 response
                x-microcks-default: true
                # statusCode matches the HTTP status of this response (assumes
                # Error.statusCode carries the HTTP status — confirm).
                value:
                  error: Endpoint not found
                  statusCode: 404
    # Vendor extension with mock settings (response delay and dispatcher).
    x-microcks-operation:
      delay: 0
      dispatcher: FALLBACK
# Metrics for one endpoint, returned as an EndpointMetrics object.
/endpoint/{namespace}/{endpoint_name}/metrics:
get:
summary: Get Endpoint Metrics
description: >-
Retrieve performance metrics for an Inference Endpoint including request
counts, latencies, and error rates.
operationId: getEndpointMetrics
tags:
- Endpoints
# Both path parameters are required strings; neither carries a description here.
parameters:
- name: namespace
in: path
required: true
schema:
type: string
example: example_value
- name: endpoint_name
in: path
required: true
schema:
type: string
example: example_value
# NOTE(review): the other /endpoint/{namespace}/{endpoint_name} operations in
# this file document a 404 response; none is declared here — confirm whether
# that is intentional.
responses:
'200':
description: Endpoint metrics
content:
application/json:
schema:
$ref: '#/components/schemas/EndpointMetrics'
examples:
Getendpointmetrics200Example:
summary: Default getEndpointMetrics 200 response
x-microcks-default: true
# NOTE(review): the EndpointMetrics schema is not visible in this view, so
# the units of these fields and which keys nest under request_duration_ms
# cannot be confirmed from here.
value:
request_count: 10
request_duration_ms:
p50: 42.5
p90: 42.5
p99: 42.5
error_rate: 42.5
tokens_per_second: 42.5
# Vendor extension with mock settings (response delay and dispatcher).
x-microcks-operation:
delay: 0
dispatcher: FALLBACK
# Lists cloud providers; this operation declares no parameters.
/provider:
get:
summary: List Available Providers
description: >-
List available cloud providers and their regions for deploying Inference
Endpoints.
operationId: listProviders
tags:
- Providers
responses:
# The providers are wrapped in an object under the `items` key.
'200':
description: Available cloud providers
content:
application/json:
schema:
type: object
properties:
items:
type: array
items:
$ref: '#/components/schemas/Provider'
examples:
Listproviders200Example:
summary: Default listProviders 200 response
x-microcks-default: true
# NOTE(review): the Provider schema is not visible in this view; the empty
# accelerators entry and the region placeholder cannot be checked against it.
value:
items:
- vendor: aws
region: example_value
status: available
accelerators:
- {}
# Vendor extension with mock settings (response delay and dispatcher).
x-microcks-operation:
delay: 0
dispatcher: FALLBACK
# Reusable definitions: the security scheme named by the document-level
# `security` requirement and the schemas targeted by `$ref`.
components:
securitySchemes:
# HTTP bearer-token authentication; bearerFormat is a documentation hint
# about the token format.
bearerAuth:
type: http
scheme: bearer
bearerFormat: HF Token
description: >-
Hugging Face user access token with Inference Endpoints permissions.
schemas:
# Endpoint: the resource returned by the endpoint operations (list, get,
# create, update, pause, resume, scale-to-zero).
# NOTE(review): nesting was reconstructed because indentation is not visible
# in the reviewed copy. scaling is placed under compute, image under model,
# and createdAt/updatedAt/url under status, matching the response examples in
# this file — confirm against the original.
Endpoint:
  type: object
  properties:
    name:
      type: string
      description: Endpoint name
      example: my-text-gen-endpoint
    type:
      type: string
      description: Endpoint type
      enum:
        - public
        - protected
        - private
      example: public
    accountId:
      type: string
      example: '500123'
    provider:
      type: object
      properties:
        vendor:
          type: string
          description: Cloud vendor
          enum:
            - aws
            - azure
            - gcp
        region:
          type: string
          description: Cloud region
          example: us-east-1
      # Object-shaped example; the previous value was the string placeholder
      # `example_value`, which is not a valid instance of an object schema.
      example:
        vendor: aws
        region: us-east-1
    compute:
      type: object
      properties:
        accelerator:
          type: string
          description: GPU or accelerator type
          example: gpu
        instanceType:
          type: string
          description: Instance type identifier
          example: nvidia-a10g
        instanceSize:
          type: string
          description: Instance size
          example: x1
        scaling:
          type: object
          properties:
            minReplica:
              type: integer
              description: Minimum number of replicas
              example: 0
            maxReplica:
              type: integer
              description: Maximum number of replicas
              example: 2
            scaleToZeroTimeout:
              type: integer
              description: Minutes of inactivity before scaling to zero
              example: 15
      # Built from the per-property examples above.
      example:
        accelerator: gpu
        instanceType: nvidia-a10g
        instanceSize: x1
        scaling:
          minReplica: 0
          maxReplica: 2
          scaleToZeroTimeout: 15
    model:
      type: object
      properties:
        repository:
          type: string
          description: Model repository ID on the Hub
          example: meta-llama/Llama-2-7b-chat-hf
        revision:
          type: string
          description: Model revision or branch
          example: main
        task:
          type: string
          description: Inference task
          example: text-generation
        framework:
          type: string
          description: Serving framework
          enum:
            - pytorch
            - custom
        image:
          type: object
          properties:
            huggingface:
              type: object
              description: Hugging Face optimized container settings
            custom:
              type: object
              description: Custom container settings
              properties:
                url:
                  type: string
                  format: uri
                port:
                  type: integer
      # Built from the per-property examples above; huggingface is an object.
      example:
        repository: meta-llama/Llama-2-7b-chat-hf
        revision: main
        task: text-generation
        framework: pytorch
        image:
          huggingface: {}
    status:
      type: object
      properties:
        state:
          type: string
          description: Current endpoint state
          enum:
            - pending
            - initializing
            - running
            - updating
            - paused
            - scaledToZero
            - failed
        message:
          type: string
          description: Human-readable status message
        createdAt:
          type: string
          format: date-time
        updatedAt:
          type: string
          format: date-time
        url:
          type: string
          format: uri
          description: Inference URL for the running endpoint
      # Object-shaped example using an enum state and date-time/uri strings.
      example:
        state: running
        createdAt: '2026-01-15T10:30:00Z'
        updatedAt: '2026-01-15T10:30:00Z'
        url: https://www.example.com
    url:
      type: string
      format: uri
      description: Inference URL for the endpoint
      example: https://www.example.com
CreateEndpointRequest:
type: object
required:
- name
- type
- provid
# --- truncated at 32 KB (37 KB total) ---
# Full source: https://raw.githubusercontent.com/api-evangelist/hugging-face/refs/heads/main/openapi/hugging-face-inference-endpoints-api.yml