# OpenAPI Specification version this document is written against.
openapi: 3.1.0
# API-level metadata: title, version, support contact, and the landing text.
info:
  title: bitHuman
  # Quoted so the version stays a string rather than being read as a float.
  version: "1.0"
  contact:
    name: bitHuman Support
    url: https://discord.gg/ES953n7bPA
  # Vendor extension (`x-` prefix); presumably read by the docs renderer to
  # show a logo — confirm against the docs tooling.
  x-logo:
    url: https://docs.bithuman.ai/bithuman-mark.png
  # Literal block scalar holding Markdown with embedded HTML; it is emitted
  # verbatim, so comments must stay outside it.
  # NOTE(review): the `#tag/...` anchors use mixed casing (`quickstart`,
  # `Voice`, `sdk-python`) and the `bh-card` classes are styled elsewhere —
  # verify each anchor resolves against the tag names actually declared.
  description: |
    Real-time, lip-synced AI avatars. Audio in, talking video out at
    25 FPS with sub-200 ms latency. Render on the cloud, on-device, or
    in the browser — one engine, one `.imx` file.

    <div class="bh-cards">
      <a href="#tag/quickstart" class="bh-card">
        <strong>Quickstart →</strong>
        <span>Your first talking avatar in 2 minutes</span>
      </a>
      <a href="#tag/Voice" class="bh-card">
        <strong>Voice API →</strong>
        <span>TTS in 30+ languages, 10 voices, OpenAI-compatible</span>
      </a>
      <a href="#tag/sdk-python" class="bh-card">
        <strong>Python SDK →</strong>
        <span>pip install bithuman</span>
      </a>
      <a href="#tag/sdk-swift" class="bh-card">
        <strong>Swift SDK →</strong>
        <span>Mac / iPad / iPhone, on-device</span>
      </a>
    </div>

    All endpoints are relative to `https://api.bithuman.ai` and
    require an `api-secret` header.
    [Get a free API secret →](https://www.bithuman.ai/#developer)

    Some routes are intentionally excluded from this reference because
    they are internal or infrastructure — webhook receivers, `/func_*`
    generation internals, health/telemetry, and realtime session
    plumbing. They are not part of the public developer contract and
    may change without notice.

# Base URLs for the API; every path in this document is appended to a server
# URL. Only the production host is declared.
servers:
  - url: https://api.bithuman.ai
    description: Production

# Document-wide default security requirement: it applies to every operation
# that does not declare its own `security`. `apiSecret` names a scheme that is
# expected under components.securitySchemes (outside this section — confirm).
security:
  - apiSecret: []

paths:
  # Credential check. The outcome is reported in the response body, not via
  # the HTTP status code.
  /v1/validate:
    post:
      tags:
        - Authentication
      operationId: validateApiSecret
      summary: 'Validate API secret'
      description: |
        Verify that your API secret is valid and your account is active.

        Always returns HTTP 200. Inspect the `valid` field in the body
        to determine outcome — invalid or missing credentials yield
        `{"valid": false}` rather than a 401.
      responses:
        # The only documented status; a rejected secret is `valid: false`.
        '200':
          description: 'Validation complete (check the `valid` field)'
          content:
            application/json:
              schema:
                type: object
                properties:
                  valid:
                    example: true
                    type: boolean

  # Starts an asynchronous, billed agent creation. Progress is read from the
  # status endpoint, not from this response.
  /v1/agent/generate:
    post:
      operationId: generateAgent
      summary: Generate a new agent
      description: |
        Create a new avatar agent from a text prompt, with optional image,
        video, and audio assets. Poll `/v1/agent/status/{agent_id}` to check
        progress.

        **Creation time and inputs depend on `model`** (the second-generation
        models train real per-identity models — don't apply a 5-minute client
        timeout):

        | model | identity input | typical creation time |
        |---|---|---|
        | `essence-1` (default) | image (or generated from prompt) | 2–5 minutes |
        | `expression-1` | image (or generated from prompt) | ~1–2 minutes |
        | `essence-2-quality` | **video required** (identity prepped from real footage in seconds) | a few minutes end-to-end |
        | `essence-2-light` | video, or image (an identity video is generated first) | 45 minutes typical; long builds can take hours |
        | `essence-2` | video, or image (an identity video is generated first) | same as `essence-2-light` — **both Essence 2 tiers from one creation** |
        | `expression-2` | image (or generated from prompt) | ~45 minutes (30–60) |
        | `auto` | image or prompt (classified automatically) | as the routed model (`essence-2` or `expression-2`) |

        **Subject requirement (Essence 2 family):** `essence-2`,
        `essence-2-quality` and `essence-2-light` require a **photorealistic
        human** subject. The input is classified before anything is billed; a
        cartoon / animal / exotic-creature input is rejected with
        `422 MODEL_SUBJECT_MISMATCH` (nothing charged) — use `expression-2`
        for non-human or stylized subjects, or send `model: "auto"` to route
        automatically.

        Failures during generation surface as `status: "failed"` with an
        `error_message` on the status endpoint, and the creation credits
        are automatically refunded.
      tags: [Agent Generation]
      parameters:
        # Opt-in retry safety: a repeated key replays the first response
        # instead of starting a second billed job.
        - name: Idempotency-Key
          in: header
          required: false
          schema: { type: string }
          description: Optional. A unique client-generated key (e.g. a UUID). Retrying with the same key returns the original response instead of starting a second billed job. Keys are retained for 24 hours.
      requestBody:
        required: true
        content:
          application/json:
            schema:
              # No property is listed as required; each one documents its own
              # fallback when omitted.
              type: object
              properties:
                prompt:
                  type: string
                  description: System prompt / personality for the agent. If omitted, a random default is used.
                  example: "You are a friendly fitness coach who motivates users."
                image:
                  type: string
                  description: URL to a face image for avatar appearance.
                  example: "https://example.com/face.jpg"
                video:
                  type: string
                  description: URL to a video for avatar appearance and mannerisms.
                  example: "https://example.com/clip.mp4"
                audio:
                  type: string
                  description: URL to audio for voice cloning.
                  example: "https://example.com/voice.wav"
                aspect_ratio:
                  type: string
                  # Quoted: digit-colon-digit scalars are not plain strings
                  # under YAML 1.1 parsers.
                  enum: ["16:9", "9:16", "1:1"]
                  default: "16:9"
                  description: Aspect ratio for image generation.
                video_aspect_ratio:
                  type: string
                  enum: ["16:9", "9:16", "1:1"]
                  default: "16:9"
                  description: Aspect ratio for video generation.
                agent_id:
                  type: string
                  description: Custom agent identifier. Auto-generated if omitted.
                duration:
                  type: integer
                  default: 10
                  description: Source video duration in seconds.
                model:
                  type: string
                  enum: [essence-1, essence-2, essence-2-quality, essence-2-light, expression-1, expression-2, essence, expression, auto]
                  # `default` is what an omitted field means; `example` shows
                  # the explicit opt-in to automatic routing.
                  default: essence-1
                  example: "auto"
                  description: >-
                    Avatar runtime model for the generated agent. Public names:
                    `essence-1`, `essence-2-quality`, `essence-2-light`,
                    `expression-1`, `expression-2` — all five **generally
                    available** (the second generation went GA 2026-07-01).
                    Essence 2 has two tiers — **Quality** (`essence-2-quality`,
                    the highest-fidelity cloud GPU renderer) and **Light**
                    (`essence-2-light`, the cost-effective distilled renderer;
                    gpu / cpu / Apple Neural Engine, including on-device).
                    **`essence-2` is the combined Essence 2 creation** (one
                    500-credit charge): it trains the Light identity model AND
                    makes Quality available from the same identity video — you
                    pick the tier at launch (`?model=essence-2-light` /
                    `essence-2-quality`). **`auto` classifies your input**
                    (the image if provided, else the prompt): a photorealistic
                    person routes to `essence-2` (combined); a cartoon, animal
                    or exotic creature routes to `expression-2`. `auto` must be
                    sent explicitly — an omitted `model` keeps the historical
                    `essence-1` default. `expression-2` is the
                    second-generation expression engine — audio-driven
                    real-time avatar video from a single photo (creation
                    trains a per-identity model, roughly 45 minutes). The bare
                    `essence` and `expression` are **version shorthands** that
                    resolve to the current default of each family
                    (`essence`→`essence-1`, `expression`→`expression-1`).
                    Retired engine names are **no longer accepted** and return
                    `400 VALIDATION_ERROR`. Identity inputs and creation times
                    differ per model — see the endpoint description table and
                    [Essence vs Expression](#tag/models).
      # Responses are listed in ascending status-code order.
      responses:
        "200":
          description: Generation started
          content:
            application/json:
              schema:
                type: object
                properties:
                  success:
                    type: boolean
                    example: true
                  message:
                    type: string
                    example: "Agent generation started"
                  agent_id:
                    type: string
                    example: "A91XMB7113"
                  status:
                    type: string
                    example: "processing"
        "400":
          description: >-
            Body failed validation — malformed JSON, or an invalid / retired
            `model` value (rejected before dispatch; no credits charged). The
            message lists the accepted model names.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
              example:
                error:
                  code: "VALIDATION_ERROR"
                  message: "Invalid model 'my-model'; must be one of: auto, essence, essence-1, essence-2, essence-2-light, essence-2-quality, expression, expression-1, expression-2"
                  httpStatus: 400
                status: "error"
                status_code: 400
        "401":
          $ref: "#/components/responses/Unauthorized"
        "402":
          description: >-
            Insufficient credits. Creation is billed **per model** — the
            second-generation families (`essence-2`, `essence-2-quality`,
            `essence-2-light`, `expression-2`, and `auto`) cost 500 credits;
            the v1 families (`essence-1`, `expression-1`) cost 250. Top up to
            continue. Returns error code `INSUFFICIENT_BALANCE`.
          # Same error envelope as the 400 / 422 responses of this operation.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
        "422":
          description: >-
            `MODEL_SUBJECT_MISMATCH` — an explicit Essence 2 creation
            (`essence-2`, `essence-2-quality`, `essence-2-light`) whose input
            is not a photorealistic human subject. The classifier runs on the
            image if provided, else the prompt, **before anything is billed**
            and before any agent row is created. Use `expression-2` for
            cartoon / animal / stylized subjects, or `model: "auto"` (which
            never rejects — it routes instead). A classifier outage never
            blocks creation (the gate fails open).
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
              example:
                error:
                  code: "MODEL_SUBJECT_MISMATCH"
                  message: "essence-2 requires a photorealistic human subject; this image looks like a cartoon — use expression-2"
                  httpStatus: 422
                status: "error"
                status_code: 422

  # Polling endpoint for an agent creation (or model add) in progress.
  /v1/agent/status/{agent_id}:
    get:
      operationId: getAgentStatus
      summary: Get agent generation status
      description: |
        Poll this endpoint to check generation progress. Status values:
        `processing`, `generating`, `completed` (intermediate); `success` /
        `ready` (terminal success); `failed` (terminal error).

        **Keep polling past `generating` and `completed`** — those are
        intermediate states, not terminal. `completed` can appear early (even
        around ~5% `progress`), so do not stop on it. Only `success` / `ready`
        and `failed` mean polling should stop: stop immediately on `failed`
        (read `error_message`), and treat the agent as done only when
        `success` / `ready` is reported together with `progress` reaching
        `1.0`.

        `current_step` reports the pipeline stage: `payment` (~2%) →
        `persona` (5–15%) → `voice_image` (~20%) → `video` (~45%, only for
        models that generate an identity video) → `lip_sync` (70–99%, the
        model-specific identity step — training/distillation for
        `expression-2` / `essence-2-light`, so the longest stage) → `done`
        (100%). A rare `essence-1` path reports `awaiting_face_marking`
        (~35%).

        Recommended polling interval: 5 seconds. Size your overall timeout to
        the model: minutes for `essence-1` / `essence-2-quality`, ~90 minutes
        for `expression-2`, and up to several hours for `essence-2-light`.
      tags: [Agent Generation]
      parameters:
        - name: agent_id
          in: path
          required: true
          schema:
            type: string
          example: "A91XMB7113"
      responses:
        "200":
          description: Agent status
          content:
            application/json:
              schema:
                type: object
                properties:
                  success:
                    type: boolean
                  data:
                    type: object
                    properties:
                      agent_id:
                        type: string
                      name:
                        type: string
                      description:
                        type: string
                      status:
                        type: string
                        # `processing`, `generating` and `completed` are
                        # intermediate; the last three values are terminal.
                        enum: [processing, generating, completed, success, ready, failed]
                      system_prompt:
                        type: string
                      image_url:
                        type: string
                      video_url:
                        type: string
                      model_url:
                        type: string
                        description: URL to download the .imx model file (when status is ready).
                      voice_id:
                        type: string
                      progress:
                        type: number
                        format: float
                        minimum: 0
                        maximum: 1
                        description: Generation progress as a fraction from 0.0 to 1.0.
                      progress_msg:
                        type: string
                        description: Human-readable progress description.
                      # Left as a free string: the description lists "known
                      # values" rather than a closed enum.
                      current_step:
                        type: string
                        description: >-
                          Current generation step. Known values: `payment`,
                          `persona`, `voice_image`, `video`,
                          `awaiting_face_marking`, `lip_sync`, `done`.
                        example: "lip_sync"
                      supported_models:
                        type: array
                        items: { type: string }
                        description: >-
                          Canonical model families this agent can be launched
                          as right now. Trained families (`expression-2`,
                          `essence-2-light`) appear once their per-identity
                          model has been generated; `essence-2-quality`
                          appears whenever an image exists (prepares on
                          demand). Tier slugs inherit their family.
                        example: ["essence-2-quality", "expression-2"]
                      event_type:
                        type: string
                        description: Most recent event (e.g. "lip_created").
                      error_message:
                        type: string
                        description: Error details if status is "failed".
                      created_at:
                        type: string
                        format: date-time
                      updated_at:
                        type: string
                        format: date-time
        # The document-wide `apiSecret` requirement applies to this operation,
        # so the shared Unauthorized response is declared as on the other
        # secured operations.
        "401":
          $ref: "#/components/responses/Unauthorized"
        "404":
          $ref: "#/components/responses/NotFound"

  /v1/agent/{code}:
    # Read one agent's full record, addressed by its code.
    get:
      tags:
        - Agent Management
      operationId: getAgent
      summary: 'Get agent details'
      description: 'Retrieve full details for an agent by its code.'
      parameters:
        # Fills the `{code}` segment of the path.
        - in: path
          name: code
          required: true
          example: 'A91XMB7113'
          schema:
            type: string
      responses:
        '200':
          description: 'Agent details'
          content:
            application/json:
              schema:
                type: object
                properties:
                  success:
                    type: boolean
                  # Payload shape comes from the shared Agent schema.
                  data:
                    $ref: '#/components/schemas/Agent'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '404':
          $ref: '#/components/responses/NotFound'
    # Replace the system prompt of an agent that already exists.
    post:
      tags:
        - Agent Management
      operationId: updateAgent
      summary: 'Update agent prompt'
      description: >-
        Update an existing agent's system prompt. The agent must already exist
        (use `POST /v1/agent/generate` to create one).
      parameters:
        # Fills the `{code}` segment of the path.
        - in: path
          name: code
          required: true
          example: 'A91XMB7113'
          schema:
            type: string
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: object
              properties:
                system_prompt:
                  type: string
                  example: 'You are a professional sales assistant.'
                  description: 'New system prompt for the agent.'
      responses:
        # Confirmation only; the response does not include the new prompt.
        '200':
          description: 'Agent prompt updated'
          content:
            application/json:
              schema:
                type: object
                properties:
                  agent_code:
                    example: 'A91XMB7113'
                    type: string
                  updated:
                    example: true
                    type: boolean
        '401':
          $ref: '#/components/responses/Unauthorized'
        '404':
          $ref: '#/components/responses/NotFound'
    # Permanent removal of an agent owned by the caller.
    delete:
      tags:
        - Agent Management
      operationId: deleteAgent
      summary: 'Delete an agent'
      description: |
        Permanently delete an agent you own. Stored assets are cleaned up
        best-effort; usage history is retained. Deleting a missing or
        non-owned agent returns 404.
      # Declared on the operation itself; same requirement as the
      # document-wide default.
      security:
        - apiSecret: []
      parameters:
        # Fills the `{code}` segment of the path.
        - in: path
          name: code
          required: true
          example: 'A91XMB7113'
          schema:
            type: string
      responses:
        '200':
          description: 'Agent deleted'
          content:
            application/json:
              schema:
                type: object
                properties:
                  success:
                    example: true
                    type: boolean
                  agent_code:
                    example: 'A91XMB7113'
                    type: string
                  deleted:
                    example: true
                    type: boolean
        '401':
          $ref: '#/components/responses/Unauthorized'
        '404':
          $ref: '#/components/responses/NotFound'

  # Pushes text to a live agent so the avatar says it aloud.
  /v1/agent/{code}/speak:
    post:
      tags:
        - Agent Context
      operationId: agentSpeak
      summary: 'Make agent speak'
      description: |
        Send text for the avatar to speak aloud in all active sessions.
        The agent must be in at least one active LiveKit room.
      parameters:
        # Fills the `{code}` segment of the path.
        - in: path
          name: code
          required: true
          example: 'A91XMB7113'
          schema:
            type: string
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: object
              # `message` is the only mandatory field.
              required:
                - message
              properties:
                message:
                  type: string
                  example: 'Hello! Welcome to our demo.'
                  description: 'Text for the avatar to speak.'
                room_id:
                  type: string
                  description: 'Target a specific room (optional, defaults to all active rooms).'
      responses:
        '200':
          description: 'Speech triggered'
          content:
            application/json:
              schema:
                type: object
                properties:
                  agent_code:
                    example: 'A91XMB7113'
                    type: string
                  context_type:
                    example: 'speak'
                    type: string
                  delivered_to_rooms:
                    type: integer
                    description: 'Number of active rooms that received the message.'
                    example: 1
        '401':
          $ref: '#/components/responses/Unauthorized'
        '404':
          $ref: '#/components/responses/NotFound'

  # Feeds background knowledge to a live agent; silent unless `type` is
  # `speak`.
  /v1/agent/{code}/add-context:
    post:
      tags:
        - Agent Context
      operationId: addContext
      summary: 'Add context to agent'
      description: |
        Inject background knowledge into the agent's context. The avatar
        won't say this aloud, but will use the information in future responses.
      parameters:
        # Fills the `{code}` segment of the path.
        - in: path
          name: code
          required: true
          example: 'A91XMB7113'
          schema:
            type: string
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: object
              # `context` is the only mandatory field.
              required:
                - context
              properties:
                context:
                  type: string
                  example: 'The customer just purchased a premium plan.'
                  description: 'Knowledge to inject.'
                type:
                  type: string
                  enum:
                    - add_context
                    - speak
                  default: add_context
                  description: '`add_context` for silent knowledge (default), `speak` to trigger verbal response.'
                room_id:
                  type: string
                  description: 'Target a specific room (optional).'
      responses:
        # Same three fields as the speak endpoint's success body.
        '200':
          description: 'Context added'
          content:
            application/json:
              schema:
                type: object
                properties:
                  agent_code:
                    type: string
                  context_type:
                    type: string
                  delivered_to_rooms:
                    type: integer
        '401':
          $ref: '#/components/responses/Unauthorized'
        '404':
          $ref: '#/components/responses/NotFound'

  # Adds another avatar model to an existing, ready agent. Some adds are
  # instant, others are billed and asynchronous (see the table below).
  /v1/agent/{code}/models:
    post:
      operationId: addAgentModel
      summary: Add a model to an existing agent
      description: |
        Add an avatar model to an agent you already created — no need to
        re-create the agent or regenerate its persona/voice/image. The agent
        must be in the `ready` state (else `409 AGENT_NOT_READY`).

        | model | what happens | prerequisites | credits | time |
        |---|---|---|---|---|
        | `expression-1` | **instant enablement** — the v1 engine drives the agent's existing image + voice at runtime; nothing is trained | stored image + voice | **0** | immediate (this response) |
        | `expression-2` | trains the per-identity Expression 2 model from the stored image | stored image | 500 | ~10–45 min |
        | `essence-2` | the **combined** add: trains Essence 2 Light from the stored identity video; Quality becomes available from the same video at no extra charge | stored identity video (else `422`) + photorealistic-human subject (else `422 MODEL_SUBJECT_MISMATCH`) | 500 | 45 min–3 h |
        | `essence-1` | builds the v1 `.imx` from the stored identity video (or generates one from the stored image) | stored video or image | 250 | ~10–20 min |

        Asynchronous adds return `status: "processing"` — poll
        [`GET /v1/agent/status/{agent_id}`](#tag/Agent-Generation/operation/getAgentStatus)
        until `supported_models` contains the new family (`essence-2` adds
        both `essence-2-light` and `essence-2-quality`). Charges are refunded
        automatically if the add fails; re-POSTing the same model never
        double-charges, and returns `status: "ready"` with `credits: 0` once
        the model already exists.
      tags: [Agent Management]
      parameters:
        - name: code
          in: path
          required: true
          schema:
            type: string
          example: "A91XMB7113"
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: object
              required: [model]
              properties:
                model:
                  type: string
                  # Narrower than the creation endpoint's list: the Essence 2
                  # tiers are only addable through the combined `essence-2`.
                  enum: [essence-1, essence-2, expression-1, expression-2]
                  description: >-
                    The model to add. The Essence 2 tiers are not individually
                    addable — `essence-2` is the one combined add (Light train
                    + Quality from the same identity video). Anything else
                    returns `400 VALIDATION_ERROR` listing these options.
                  example: "expression-2"
      responses:
        "200":
          description: >-
            Model added (`status: "ready"` — instant/already available) or the
            add started (`status: "processing"` — poll supported_models).
          content:
            application/json:
              schema:
                type: object
                properties:
                  success:
                    type: boolean
                  agent_id:
                    type: string
                  model:
                    type: string
                  status:
                    type: string
                    enum: [ready, processing]
                  credits:
                    type: integer
                    description: Credits charged for this add (0 for instant enablement / already available).
                  supported_models:
                    type: array
                    items:
                      type: string
                    description: The agent's launchable model families at response time.
                  message:
                    type: string
        # The 4xx responses below carry named error codes, so they declare the
        # shared Error envelope used for the same codes elsewhere in this API.
        "400":
          description: Invalid `model` (the message lists the addable options).
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
        "401":
          $ref: "#/components/responses/Unauthorized"
        "404":
          $ref: "#/components/responses/NotFound"
        "409":
          description: >-
            `AGENT_NOT_READY` — the agent is still generating or failed;
            models can only be added to a ready agent.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
        "422":
          description: >-
            `MODEL_PREREQUISITE_MISSING` — the agent lacks a stored asset the
            model needs (e.g. `essence-2` needs an identity video), or
            `MODEL_SUBJECT_MISMATCH` — the Essence 2 family requires a
            photorealistic human subject and the stored image is a
            cartoon/animal/exotic creature (use `expression-2`).
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"

  # Resolves an agent's downloadable artifact: a 302 redirect by default, or
  # the URL as JSON when `redirect=false` is passed.
  /v1/agent/{code}/model/download:
    get:
      operationId: downloadAgentModel
      summary: Download an agent's model artifact
      description: |
        Download the generated model file for an agent you own. The family
        defaults to the agent's own model; override with `?model=<family>`
        (canonical names, legacy names, and runtime tier slugs all fold onto
        their family). Artifact per family:

        | family | artifact |
        |---|---|
        | `essence-1` | `<code>.imx` (the portable IMX container) |
        | `essence-2-light` | `<code>.lebundle.imx` (unified IMX container, ~350–550 MB; **licensed weights** — local playback requires the runtime license activation flow) |
        | `essence-2-quality` | `<code>.pkl` (IMX container) |
        | `expression-2` | `<code>.avatar` (CoreML zip — the Mac-runnable form, ~90 MB) |
        | `expression-1` | — not downloadable (renders server-side from the agent's image) → `400 MODEL_NOT_DOWNLOADABLE` |

        The default response is a **302 redirect** to the artifact — the
        public URL for `essence-1`, a **1-hour signed URL** on a private
        bucket for the other families — so `curl -LOJ` and browser anchors
        just work. Pass `?redirect=false` to get the URL as JSON instead.

        The launchability gate doubles as the download gate: a family the
        agent can't be launched as (missing from `supported_models`) returns
        `409 MODEL_NOT_GENERATED`. A family the agent supports whose artifact
        hasn't been published to the download store yet returns
        `404 MODEL_ARTIFACT_NOT_READY` — the message says when to retry;
        **poll on that code**.

        The [bitHuman CLI](https://docs.bithuman.ai/sdk/cli/commands) wraps
        this endpoint: `bithuman pull <AGENT_CODE>` downloads the artifact,
        recognizes its model family, and prints what to do next.
      tags: [Agent Management]
      parameters:
        - name: code
          in: path
          required: true
          schema:
            type: string
          description: Agent code; the agent must be owned by the caller (else 404).
          example: "A17ZTB0222"
        - name: model
          in: query
          required: false
          # NOTE(review): the enum lists only the canonical families, while the
          # description says legacy names and tier slugs are accepted too —
          # strict client-side validation would reject those; confirm intent.
          schema:
            type: string
            enum: [essence-1, essence-2-light, essence-2-quality, expression-2]
          description: >-
            Artifact family. Defaults to the agent's own model. Legacy names
            and runtime tier slugs (e.g. `essence-2-light-ane`,
            `expression-2-gpu`) fold onto their family; an unrecognized value
            returns `400 VALIDATION_ERROR` listing the downloadable families.
          example: "expression-2"
        - name: redirect
          in: query
          required: false
          schema:
            type: string
            # Quoted so the values stay strings instead of YAML booleans or
            # integers.
            enum: ["false", "0", "no"]
          description: >-
            Pass `false` to receive the artifact URL as JSON (below) instead
            of a 302 redirect — for UIs that want to fetch or label first.
      responses:
        "302":
          description: >-
            Redirect to the artifact — a public URL for `essence-1`, a
            1-hour signed URL for the private families. Follow it
            (`curl -LOJ`) to download the file.
          headers:
            Location:
              schema:
                type: string
              description: The artifact URL.
        "200":
          description: With `?redirect=false` — the artifact URL as JSON.
          content:
            application/json:
              schema:
                type: object
                properties:
                  success:
                    type: boolean
                    example: true
                  data:
                    type: object
                    properties:
                      code:
                        type: string
                        example: "A17ZTB0222"
                      model:
                        type: string
                        description: The resolved canonical family.
                        example: "expression-2"
                      filename:
                        type: string
                        example: "A17ZTB0222.avatar"
                      url:
                        type: string
                        description: The artifact URL (signed, 1-hour TTL, for the private families).
                      expires_in:
                        # OpenAPI 3.1 expresses nullability as a JSON Schema
                        # type union; the 3.0 `nullable` keyword is not part
                        # of 3.1.
                        type: [integer, "null"]
                        description: Signed-URL TTL in seconds (3600); `null` for the public essence-1 URL.
                        example: 3600
        # Every 4xx below that names an error code declares the shared Error
        # envelope, matching the 404 response of this operation.
        "400":
          description: >-
            `VALIDATION_ERROR` — unknown `model` value (the message lists the
            downloadable families), or `MODEL_NOT_DOWNLOADABLE` — the family
            has no per-identity artifact to download (`expression-1` renders
            server-side from the agent's image).
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
        "401":
          $ref: "#/components/responses/Unauthorized"
        "404":
          description: >-
            `NOT_FOUND` — agent unknown or not owned by this account; or
            `MODEL_ARTIFACT_NOT_READY` — the family is supported but its
            artifact hasn't been published to the download store yet. The
            message says when to retry; poll on this code.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
              example:
                error:
                  code: "MODEL_ARTIFACT_NOT_READY"
                  message: "agent A17ZTB0222's expression-2 artifact isn't available for download yet: the Mac-runnable build is paced, typically within the hour"
                  httpStatus: 404
                status: "error"
                status_code: 404
        "409":
          description: >-
            `MODEL_NOT_GENERATED` — the requested family isn't in the agent's
            `supported_models` (same gate as embed/session launch), e.g.
            `agent <code>'s expression-2 model hasn't been generated yet` or
            `agent <code>'s essence-2-quality model requires a source video,
            which this agent doesn't have`.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
        "429":
          description: Rate limited (read bucket) — `RATE_LIMITED`.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"

  # Stores a file supplied either as a remote URL or as inline base64 and
  # returns its public CDN URL plus detected metadata.
  /v1/files/upload:
    post:
      operationId: uploadFile
      summary: Upload a file
      description: |
        Upload images, videos, audio, or documents. Files are auto-organized
        by type. Supports URL download or direct base64 upload.
      tags: [Files]
      requestBody:
        required: true
        content:
          application/json:
            schema:
              # A body must satisfy exactly one variant: one requires
              # `file_url`, the other `file_data` plus `file_name`.
              oneOf:
                - type: object
                  title: Upload by URL
                  required: [file_url]
                  properties:
                    file_url:
                      type: string
                      description: Public URL to download the file from.
                      example: "https://example.com/avatar.jpg"
                    file_type:
                      type: string
                      enum: [auto, image, video, audio, document]
                      default: auto
                      description: File type (auto-detected if omitted).
                - type: object
                  title: Upload by base64
                  required: [file_data, file_name]
                  properties:
                    file_data:
                      type: string
                      description: Base64-encoded file data.
                    file_name:
                      type: string
                      description: Original filename with extension.
                      example: "avatar.jpg"
                    file_type:
                      type: string
                      enum: [auto, image, video, audio, document]
                      default: auto
                      description: File type (auto-detected if omitted).
      responses:
        "200":
          description: File uploaded
          content:
            application/json:
              schema:
                type: object
                properties:
                  success:
                    type: boolean
                  message:
                    type: string
                    example: "File uploaded successfully"
                  data:
                    type: object
                    properties:
                      file_url:
                        type: string
                        description: Public CDN URL of the uploaded file.
                      original_source:
                        type: string
                        description: Original URL or filename.
                      file_type:
                        type: string
                        description: Detected file type category.
                      file_size:
                        type: integer
                        description: File size in bytes.
                      mime_type:
                        type: string
                        description: MIME content type.
                      asset_category:
                        type: string
                        description: Storage category (images, videos, audio, docs).
                      uploaded_at:
                        type: string
                        format: date-time
        "400":
          description: "`file_url` could not be downloaded — unreachable, non-public, or blocked (`DOWNLOAD_FAILED`); or `file_data` was not valid base64 (`VALIDATION_ERROR`)."
        # The document-level `apiSecret` security applies to this operation,
        # so the shared Unauthorized response is declared like on sibling paths.
        "401":
          $ref: "#/components/responses/Unauthorized"
        "413":
          description: File too large (max 10 MB for images, 100 MB for video)
        "415":
          description: Unsupported file type

  # Account preflight: one call returning the caller's identity, plan, and
  # current credit balance.
  /v1/me:
    get:
      operationId: getMe
      summary: Get current account
      description: >-
        The canonical account preflight — returns your identity, plan, and live
        credit balance in one call. Use it to show "who am I / how many credits"
        and to gate features before a billable call.
      tags: [Account]
      responses:
        "200":
          description: Account retrieved
          content:
            application/json:
              schema:
                # Envelope for this endpoint is `data` + `status` +
                # `status_code`; no `success` boolean is declared here.
                type: object
                properties:
                  data:
                    type: object
                    properties:
                      user_id:
                        type: string
                      email:
                        type: string
                      plan:
                        type: string
                        example: "pro"
                      plan_code:
                        type: string
                        example: "membership_pro"
                      # In the examples, plan (5055) + topup (5831600) is
                      # 5836655; the remaining 327 of `credit_balance` would be
                      # reward credits, which have no dedicated field in this
                      # schema. NOTE(review): confirm no reward field is missing.
                      credit_balance:
                        type: integer
                        example: 5836982
                        description: Total spendable credits = plan + topup + reward.
                      plan_credits_remaining:
                        type: integer
                        example: 5055
                      topup_credits_remaining:
                        type: integer
                        example: 5831600
                      account_status:
                        type: string
                        example: "active"
                      # NOTE(review): neither URL field carries a description
                      # or example; presumably links to the billing pages.
                      upgrade_url:
                        type: string
                      pricing_url:
                        type: string
                  status:
                    type: string
                    example: "success"
                  status_code:
                    type: integer
                    example: 200
        "401":
          $ref: "#/components/responses/Unauthorized"

  # Machine-readable credit price list: one-time agent creation costs,
  # per-minute talking-video rates, and the flat dynamics generation cost.
  /v1/pricing:
    get:
      operationId: getPricing
      summary: Get credit pricing
      description: >-
        Machine-readable credit pricing for billable operations, so you can
        estimate cost before a call. Authoritative charges are always enforced
        server-side at request time. For forward per-minute realtime estimates
        based on your current balance, see `GET /v2/credit-summaries`
        (`minutes_estimate`).
      tags: [Account]
      responses:
        "200":
          description: Pricing retrieved
          content:
            application/json:
              schema:
                type: object
                properties:
                  success:
                    type: boolean
                    example: true
                  data:
                    type: object
                    properties:
                      unit:
                        type: string
                        example: "credits"
                      agent_generation:
                        type: object
                        description: >-
                          Agent creation is priced **per model** — read
                          `by_model` (there is no flat rate). The
                          second-generation families cost 500 credits; the v1
                          families cost 250. `essence-2` is the combined
                          Essence 2 creation (one 500-credit charge covers
                          both tiers); `auto` charges the routed model's
                          500-credit rate. Post-generation adds via
                          `POST /v1/agent/{code}/models` charge the same
                          per-model rates (adding `expression-1` is free).
                        properties:
                          unit:
                            type: string
                            example: "credits"
                          # Open-ended map: keys are model names, values are
                          # integer credit costs.
                          by_model:
                            type: object
                            additionalProperties:
                              type: integer
                            description: One-time creation cost per `model` value accepted by `POST /v1/agent/generate`.
                            example:
                              essence-1: 250
                              expression-1: 250
                              essence-2-quality: 500
                              essence-2-light: 500
                              expression-2: 500
                              essence-2: 500
                              auto: 500
                          note:
                            type: string
                      talking_video:
                        type: object
                        properties:
                          unit:
                            type: string
                            example: "credits_per_minute"
                          billing:
                            type: string
                            example: "ceil(minutes) * rate, minimum 1 minute"
                          # The example lists every model accepted by
                          # `POST /v1/video/generate`, at the per-minute rates
                          # that operation documents.
                          rates:
                            type: object
                            additionalProperties:
                              type: integer
                            description: Per-minute credit rate per `model` value accepted by `POST /v1/video/generate`.
                            example:
                              expression-2: 4
                              essence-2-quality: 8
                              essence-2-light: 4
                      dynamics_generation:
                        type: object
                        properties:
                          flat:
                            type: integer
                            example: 250
                          note:
                            type: string
                      notes:
                        type: string
        "401":
          $ref: "#/components/responses/Unauthorized"

  # Credit balance lookup for the account that owns the `api-secret`.
  /v2/credit-summaries:
    get:
      operationId: getCreditSummaries
      summary: Check credit balance
      description: |
        Returns the current credit balance, plan details, and estimated minutes
        remaining for each usage type. Always scoped to the **authenticated
        account** (the owner of the `api-secret`) — there is no way to look up
        another user.
      tags: [Billing]
      # Restates the document-level `apiSecret` requirement explicitly.
      security:
        - apiSecret: []
      parameters:
        - name: app
          in: query
          required: false
          description: App identifier for multi-app subscription support.
          schema:
            type: string
            default: imaginex
        # No schema default is declared for `app_key`; the fallback to `app`
        # is stated only in its description.
        - name: app_key
          in: query
          required: false
          description: Explicit subscription key for collection-scoped apps. Defaults to the same value as `app`.
          schema:
            type: string
      responses:
        "200":
          description: Credit balance retrieved
          content:
            application/json:
              schema:
                type: object
                properties:
                  success:
                    type: boolean
                  data:
                    type: object
                    properties:
                      user_id:
                        type: string
                        description: The authenticated account's UUID.
                      # Credit amounts here are typed as floats, whereas
                      # `GET /v1/me` declares its credit fields as integers.
                      balance:
                        type: number
                        format: float
                        description: Sum of plan + topup + reward credits, returned as a float (e.g. 5910592.0).
                        example: 5910592.0
                      plan_credits:
                        type: number
                        format: float
                      topup_credits:
                        type: number
                        format: float
                      # NOTE(review): no inner properties are declared, so the
                      # shape of the per-usage-type estimates is unspecified.
                      minutes_estimate:
                        type: object
                      # NOTE(review): two enterprise flags with different
                      # casing and no descriptions — presumably one is a
                      # legacy alias of the other; confirm and document.
                      is_enterprise:
                        type: boolean
                      isEnterprisePlanUser:
                        type: boolean
        "401":
          $ref: "#/components/responses/Unauthorized"

  # Backend-only call that mints a short-lived JWT for the embed widget,
  # optionally pinned to a specific avatar model.
  /v1/embed-tokens/request:
    post:
      operationId: requestEmbedToken
      summary: Generate embed token
      description: Generate a JWT token for embedding an avatar on a website. Use this from your backend — never expose your API secret in frontend code.
      tags: [Embedding]
      # Restates the document-level `apiSecret` requirement explicitly.
      security:
        - apiSecret: []
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: object
              required:
                - agent_id
                - fingerprint
              properties:
                agent_id:
                  type: string
                  description: Agent code (e.g. A78WKV4515)
                  example: "A78WKV4515"
                fingerprint:
                  type: string
                  description: Stable per-device hex string used for session tracking and per-visitor rate limiting. Required.
                  example: "9f3c8b1a2d4e5f6789ab"
                # Left as a free-form string (no enum) because runtime tier
                # slugs are accepted in addition to the canonical names.
                model:
                  type: string
                  description: >-
                    Optional. Request a specific avatar model for the embed
                    session — a canonical name (`essence-1`, `expression-1`,
                    `essence-2-quality`, `essence-2-light`, `expression-2`) or
                    a runtime tier slug (`expression-2-gpu/cpu/ane`,
                    `essence-2-light-gpu/cpu/ane`). This is the **early
                    artifact gate**: if the requested model is
                    `expression-2*` / `essence-2-light*` and the agent's
                    trained per-identity model hasn't been generated yet, the
                    request is rejected `409 MODEL_NOT_GENERATED` instead of
                    failing later at session dispatch. Unknown values return
                    `400 VALIDATION_ERROR` listing the accepted names.
                    Omitted → the agent's own default model.
                  example: "expression-2"
      responses:
        "200":
          description: Token generated. The returned `token` is a JWT (HS256-signed) with a 1-hour TTL.
          content:
            application/json:
              schema:
                type: object
                properties:
                  status:
                    type: string
                    example: success
                  status_code:
                    type: integer
                    example: 200
                  data:
                    type: object
                    properties:
                      token:
                        type: string
                        description: 1-hour JWT to pass as the `data-token` attribute on the embed widget script tag.
                      sid:
                        type: string
                        description: Session identifier for tracking this embed instance.
                      model:
                        type: string
                        description: The model baked into the token claim (present when `model` was requested).
                      supported_models:
                        type: array
                        items: { type: string }
                        description: >-
                          Canonical model families this agent can be launched
                          as right now (e.g. a ready expression-2 agent lists
                          `expression-2`; an agent with a source video lists
                          `essence-2-quality`, whose identity prepares on
                          demand from that video). Tier slugs inherit their
                          family and are not listed.
                        example: ["essence-2-quality", "expression-2"]
        # Declares the rejection that the `model` property description
        # promises for unrecognized values.
        "400":
          description: >-
            `VALIDATION_ERROR` — the requested `model` is not a recognized
            canonical name or runtime tier slug; the message lists the
            accepted names.
        "401":
          $ref: "#/components/responses/Unauthorized"
        "409":
          description: >-
            `MODEL_NOT_GENERATED` — the requested `model` folds to a known
            family the agent can't be launched as right now (it is missing
            from the agent's `supported_models`). For the trained families
            (`expression-2` / `essence-2-light`) the message is `agent
            <code>'s <model> model hasn't been generated yet`; for
            `essence-2-quality` it is `agent <code>'s essence-2-quality model
            requires a source video, which this agent doesn't have`. Add the
            model (`POST /v1/agent/{code}/models`) or create the agent with
            it, then retry.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
              example:
                error:
                  code: "MODEL_NOT_GENERATED"
                  message: "agent A56ZFX6217's expression-2 model hasn't been generated yet"
                  httpStatus: 409
                status: "error"
                status_code: 409

  # Starts a billed background job that produces gesture animations for an
  # agent; the 200 response only acknowledges that the job has started.
  /v1/dynamics/generate:
    post:
      operationId: generateDynamics
      summary: Generate gesture animations
      description: |
        Create gesture animations (wave, nod, smile, laugh, etc.) for an agent.
        These can be triggered during live sessions via keyword mapping.
      tags: [Dynamics]
      parameters:
        - name: Idempotency-Key
          in: header
          required: false
          schema: { type: string }
          description: Optional. A unique client-generated key (e.g. a UUID). Retrying with the same key returns the original response instead of starting a second billed job. Keys are retained for 24 hours.
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: object
              required: [agent_id]
              properties:
                agent_id:
                  type: string
                  example: "A91XMB7113"
                image_url:
                  type: string
                  description: Source image URL. Optional — defaults to the agent's primary image if omitted.
                duration:
                  type: integer
                  default: 5
                  description: Gesture duration in seconds.
                model:
                  type: string
                  enum: [seedance, kling, auto]
                  default: seedance
                  description: >-
                    Video motion model. `seedance` (default) or `kling`; `auto`
                    lets the platform choose. Any other value returns 400.
      responses:
        "200":
          description: Dynamics generation started
          content:
            application/json:
              schema:
                type: object
                properties:
                  success:
                    type: boolean
                  message:
                    type: string
                    example: "Dynamics generation started"
                  agent_id:
                    type: string
                  status:
                    type: string
                    example: "processing"
        # Declares the rejection that the `model` property description
        # promises for values outside its enum.
        "400":
          description: Validation error — `model` is not one of `seedance`, `kling`, or `auto`.
        "401":
          $ref: "#/components/responses/Unauthorized"
        "402":
          description: Insufficient credits. Dynamics generation costs 250 credits — INSUFFICIENT_BALANCE.
        "404":
          $ref: "#/components/responses/NotFound"

  # Per-agent dynamics resource: GET lists the generated gestures, PUT
  # updates the dynamics configuration.
  /v1/dynamics/{agent_id}:
    get:
      operationId: getDynamics
      summary: Get available gestures
      description: List all available gesture animations for an agent.
      tags: [Dynamics]
      parameters:
        - name: agent_id
          in: path
          required: true
          schema:
            type: string
          example: "A91XMB7113"
      responses:
        "200":
          description: Available gestures
          content:
            application/json:
              schema:
                type: object
                properties:
                  success:
                    type: boolean
                  data:
                    type: object
                    properties:
                      agent_id:
                        type: string
                      # The description allows null, so the type must too
                      # (OpenAPI 3.1 / JSON Schema type-array form).
                      url:
                        type: [string, "null"]
                        description: Model path or null.
                      status:
                        type: string
                        enum: [ready, generating, processing, failed]
                      gestures:
                        type: object
                        description: Map of gesture name to video URL.
                        additionalProperties:
                          type: string
        # The document-level `apiSecret` security applies to this operation,
        # so the shared Unauthorized response is declared like on sibling paths.
        "401":
          $ref: "#/components/responses/Unauthorized"
        "404":
          $ref: "#/components/responses/NotFound"
    # Merges the supplied dynamics configuration into the agent's existing
    # data; a successful update also triggers background regeneration.
    put:
      operationId: updateDynamics
      summary: Update dynamics configuration
      description: |
        Update gesture settings for an agent. After a successful update,
        movements regeneration is automatically triggered in the background.
      tags: [Dynamics]
      parameters:
        - name: agent_id
          in: path
          required: true
          schema:
            type: string
          example: "A91XMB7113"
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: object
              required: [dynamics]
              properties:
                dynamics:
                  type: object
                  description: |
                    Dynamics configuration to merge with existing data.
                    Fields: `enabled` (boolean), `batch_results` (gesture results),
                    `result` (model path/hash when generated), `talking` (default model path/hash).
                  properties:
                    enabled:
                      type: boolean
                      description: Enable or disable dynamics for this agent.
                    batch_results:
                      type: object
                      description: Map of gesture names to video generation results.
                    result:
                      type: object
                      description: Result model path and hash (set when dynamics generation completes).
                    talking:
                      type: object
                      description: Default talking model path and hash (used when dynamics are disabled).
                # Sits beside `dynamics` at the top level of the body, not
                # inside it.
                toggle_enabled:
                  type: boolean
                  description: |
                    `true` to switch agent to dynamics model, `false` to restore default talking model.
      responses:
        "200":
          description: Dynamics updated and movements regeneration triggered
          content:
            application/json:
              schema:
                type: object
                properties:
                  success:
                    type: boolean
                  message:
                    type: string
                    example: "Dynamics updated successfully and movements regeneration started"
                  agent_id:
                    type: string
                  # A 200 can still report `failed` here: the update was
                  # applied but the background regeneration was not started.
                  regeneration_status:
                    type: string
                    enum: [started, failed]
                    description: Whether background movements regeneration was triggered.
                  regeneration_error:
                    type: string
                    description: Error message if regeneration_status is "failed".
        # The document-level `apiSecret` security applies to this operation,
        # so the shared Unauthorized response is declared like on sibling paths.
        # NOTE(review): behavior for an unknown `agent_id` is not documented
        # for PUT — confirm whether it returns 404 like the GET.
        "401":
          $ref: "#/components/responses/Unauthorized"

  # Starts (or, with `wait`, synchronously completes) a talking-video render
  # for one of the caller's agents, from text or a hosted audio file.
  /v1/video/generate:
    post:
      operationId: generateTalkingVideo
      summary: Generate a talking video
      description: |
        Render a talking-video (mp4) from **text** or **audio**, driven by one of
        your agents.

        **Two modes:**
        - **Async (default)** — returns immediately with a `job_id`; poll
          [`GET /v1/video/{job_id}`](#operation/getTalkingVideo) for the finished
          video. Use this for anything but the shortest clips — renders can take a
          while, so you don't want to hold the connection.
        - **Sync** — set `wait: true` (or `?wait=true`). The call blocks until the
          render finishes (up to ~90s) and returns the `video_url` directly in this
          response. If the render is still running at that cap you get the async
          response (`status: processing`) instead — just poll the `job_id`.

        **Two input types:**
        - **Text input** — the agent's own voice speaks your `text` (text → speech →
          render). Pass `input.voice` to override the voice for this render.
        - **Audio input** — your `audio_url` (a hosted WAV/MP3) drives the render
          directly.

        The output is a public CDN URL. Limits: max **120 seconds** of output, max
        **5000 characters** of text.
      tags: [Video]
      parameters:
        - name: Idempotency-Key
          in: header
          required: false
          schema: { type: string }
          description: Optional. A unique client-generated key (e.g. a UUID). Retrying with the same key returns the original response instead of starting a second billed job. Keys are retained for 24 hours.
        # Declares the `?wait=true` form that the operation description and
        # the body `wait` property both document as supported.
        - name: wait
          in: query
          required: false
          schema:
            type: boolean
            default: false
          description: >-
            Query-string form of the body `wait` flag. `true` blocks until the
            render finishes (up to ~90s) and returns the finished `video_url`;
            otherwise a `job_id` is returned immediately.
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: object
              required: [model, input, agent_code]
              properties:
                model:
                  type: string
                  enum: [expression-2, essence-2-quality, essence-2-light]
                  description: |
                    Avatar engine. `expression-2` (4 credits/min),
                    `essence-2-quality` (8 credits/min), or `essence-2-light`
                    (4 credits/min, the efficient distilled renderer).
                  example: "essence-2-quality"
                agent_code:
                  type: string
                  description: An agent you own — provides the avatar identity (and, for text input, the default voice).
                  example: "A91XMB7113"
                wait:
                  type: boolean
                  default: false
                  description: >-
                    Sync toggle. `false` (default) returns a `job_id` immediately
                    (async — poll for the result). `true` blocks until the render
                    finishes (up to ~90s) and returns the finished `video_url` in
                    this response; if it exceeds the cap you get
                    `status: processing` + a `job_id` to poll. `?wait=true` as a
                    query parameter also works.
                # Only `type` is schema-required; the text/audio_url
                # requirement per type is stated in the field descriptions,
                # not enforced by the schema.
                input:
                  type: object
                  required: [type]
                  properties:
                    type:
                      type: string
                      enum: [text, audio]
                    text:
                      type: string
                      maxLength: 5000
                      description: Required when `type` is `text`. The script to speak.
                      example: "Hello, welcome to bitHuman."
                    voice:
                      type: string
                      description: Optional voice id override for text input. Defaults to the agent's own voice.
                    audio_url:
                      type: string
                      description: Required when `type` is `audio`. A public URL to a WAV or MP3 file.
                      example: "https://example.com/speech.wav"
      responses:
        "200":
          description: >-
            Async (or sync-timed-out): `status: processing` + a `job_id` to poll.
            Sync completed: `status: completed` with `video_url`, `duration_seconds`,
            and `credits_charged`.
          content:
            application/json:
              schema:
                type: object
                properties:
                  success:
                    type: boolean
                  job_id:
                    type: string
                    example: "vid_3f9a2c1b8e7d4a6f0b21"
                  status:
                    type: string
                    enum: [processing, completed]
                    example: "processing"
                  video_url:
                    type: string
                    description: Public mp4 URL — present only on a completed sync render (wait=true).
                  duration_seconds:
                    type: number
                    format: float
                    description: Output duration — present on a completed sync render.
                  credits_charged:
                    type: integer
                    description: Credits charged — present on a completed sync render.
        "400":
          description: Validation error — invalid model, missing input, or text/audio missing.
        "401":
          $ref: "#/components/responses/Unauthorized"
        "402":
          description: Insufficient credits — INSUFFICIENT_BALANCE. The cost is `ceil(minutes) × rate` (expression-2 = 4, essence-2-quality = 8, essence-2-light = 4 credits/min).
        "404":
          $ref: "#/components/responses/NotFound"
        "409":
          description: >-
            `MODEL_NOT_GENERATED` — checked **before any charge**: the
            requested `model` is a family this agent can't be launched as. For
            `expression-2` / `essence-2-light` that means the trained
            per-identity model hasn't been generated yet (message: `agent
            <code>'s <model> model hasn't been generated yet`);
            `essence-2-quality` is gated on the agent's **source video** — its
            identity prepares on demand from that footage, so an agent without
            one gets `agent <code>'s essence-2-quality model requires a source
            video, which this agent doesn't have`.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
              example:
                error:
                  code: "MODEL_NOT_GENERATED"
                  message: "agent A56ZFX6217's essence-2-light model hasn't been generated yet"
                  httpStatus: 409
                status: "error"
                status_code: 409

  # Polling endpoint for a render job started by `POST /v1/video/generate`.
  /v1/video/{job_id}:
    get:
      operationId: getTalkingVideo
      summary: Get talking-video status
      description: |
        Poll a talking-video render job. While rendering, `status` is `processing`.
        When done, `status` is `completed` and the response carries `video_url`,
        `duration_seconds`, and `credits_charged`. On failure, `status` is `failed`
        and the up-front charge is automatically refunded.
      tags: [Video]
      parameters:
        - name: job_id
          in: path
          required: true
          schema:
            type: string
          example: "vid_3f9a2c1b8e7d4a6f0b21"
      responses:
        "200":
          description: Job status
          content:
            application/json:
              schema:
                # No property is marked required: the result fields and
                # `error` appear only for the statuses named in their
                # descriptions.
                type: object
                properties:
                  success:
                    type: boolean
                  job_id:
                    type: string
                  # Adds `failed` to the two states the generate call itself
                  # can return (`processing`, `completed`).
                  status:
                    type: string
                    enum: [processing, completed, failed]
                  model:
                    type: string
                  video_url:
                    type: string
                    description: Public mp4 URL (present when status is `completed`).
                  duration_seconds:
                    type: number
                    format: float
                    description: Output duration in seconds (present when `completed`).
                  credits_charged:
                    type: integer
                    description: Credits charged for this render (present when `completed`).
                  # Declared inline with only a `message` field, unlike the
                  # shared `Error` schema referenced by other operations.
                  error:
                    type: object
                    description: Failure detail (present when status is `failed`).
                    properties:
                      message:
                        type: string
        "401":
          $ref: "#/components/responses/Unauthorized"
        "404":
          $ref: "#/components/responses/NotFound"

  /v1/tts:
    post:
      operationId: synthesizeSpeech
      summary: Synthesize speech from text
      description: |
        Generates audio (WAV or PCM) from a text prompt. Pick a built-in voice
        (`M1`-`F5`), tune it inline with `axes`, or pass a `voice_code` from the
        [Voice Designer](https://www.bithuman.ai/voice). Supports 30+ languages,
        per-request quality / speed tuning, and optional sentence-by-sentence
        streaming. An unknown `voice_code` returns `404 VOICE_NOT_FOUND`.
      tags: [Voice]
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: object
              required: [text]
              properties:
                text:
                  type: string
                  description: Text to synthesize (any length; multi-sentence supported).
                  example: "Hello from bitHuman."
                # Voice selection. Per the field descriptions below,
                # `voice_code` takes precedence over `voice`, and `axes` is
                # ignored whenever `voice_code` is supplied.
                voice:
                  type: string
                  description: Voice ID. Built-in voices are `M1`-`M5` (male) and `F1`-`F5` (female). List them with `GET /v1/voices`.
                  default: "M1"
                  example: "F1"
                voice_code:
                  type: string
                  description: |
                    A designed-voice handle from the [Voice Designer](https://www.bithuman.ai/voice).
                    Expands server-side to a base voice + tuning, so you don't send `voice` / `axes`
                    yourself. A UUID references a saved voice; a `bv1_…` code self-encodes the config.
                    Takes precedence over `voice` when present.
                  example: "f8fb5feb-8a19-435c-89e5-a286a03565ec"
                axes:
                  type: object
                  description: |
                    Inline voice tuning — the same semantic axes as `GET /v1/studio/axes`
                    (`gender`, `pitch`, `rate`, `brightness`). Offsets are small (roughly −0.3…0.3);
                    `0` is neutral. Ignored when `voice_code` is supplied.
                  additionalProperties:
                    type: number
                  example:
                    gender: 0.1
                    pitch: 0.05
                    rate: -0.1
                    brightness: 0.2
                language:
                  type: string
                  description: ISO 639-1 two-letter language code. 30+ languages supported (the advertised count and the live server list don't always match exactly).
                  default: "en"
                  example: "en"
                # Quality / speed tuning knobs.
                total_steps:
                  type: integer
                  minimum: 1
                  maximum: 100
                  default: 8
                  description: Denoise step count. `5`=fast/lower quality, `8`=balanced, `12`=highest.
                speed:
                  type: number
                  minimum: 0.7
                  maximum: 2.0
                  default: 1.05
                  description: Playback rate.
                # Output shaping.
                format:
                  type: string
                  enum: [wav, pcm_s16le]
                  default: wav
                  description: Response audio format.
                stream:
                  type: boolean
                  default: false
                  description: If true, returns sentence-chunked length-prefixed PCM frames.
      responses:
        "200":
          description: Audio bytes (WAV or PCM, per request `format`).
          content:
            audio/wav:
              schema:
                type: string
                format: binary
            audio/pcm:
              schema:
                type: string
                format: binary
        "400":
          $ref: "#/components/responses/BadRequest"
        "401":
          $ref: "#/components/responses/Unauthorized"
        "404":
          description: "`voice_code` did not resolve to a known voice (`VOICE_NOT_FOUND`)."
          # NOTE(review): assumes this coded error uses the same JSON envelope
          # as the other coded errors in this spec — confirm against the server.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
        "503":
          description: Queue full / service overloaded. Retry with backoff.

  /v1/voices:
    get:
      operationId: listVoices
      summary: List available voices
      description: Returns the catalog of built-in and custom voices.
      tags:
        - Voice
      responses:
        "200":
          description: Voice catalog
          content:
            application/json:
              schema:
                type: object
                properties:
                  voices:
                    type: array
                    # One object per voice in the catalog.
                    items:
                      type: object
                      properties:
                        id: { type: string, example: "F1" }
                        kind:
                          type: string
                          enum:
                            - builtin
                            - custom
                          example: 'builtin'
        "401":
          $ref: "#/components/responses/Unauthorized"

  /v1/audio/speech:
    post:
      operationId: openaiCompatSpeech
      summary: OpenAI-compatible speech endpoint
      description: |
        Drop-in compatible with OpenAI's `POST /v1/audio/speech`. Useful for
        migrating existing OpenAI clients to bitHuman without code changes —
        swap the base URL to `https://api.bithuman.ai/v1` and the auth
        header to `api-secret`.
      tags:
        - Voice
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: object
              required:
                - input
              properties:
                model:
                  type: string
                  description: Ignored — bitHuman uses its own TTS model. Pass `"tts-1"` for compatibility.
                  example: 'tts-1'
                input:
                  type: string
                  description: Text to synthesize.
                  example: 'Hello from bitHuman.'
                voice:
                  type: string
                  description: bitHuman voice ID (`M1`-`F5`).
                  default: 'M1'
                response_format:
                  type: string
                  # `wav` is the only value this schema accepts.
                  enum:
                    - wav
                  default: wav
                speed: { type: number, minimum: 0.7, maximum: 2.0, default: 1.05 }
      responses:
        "200":
          description: Audio bytes (WAV by default).
          content:
            audio/wav:
              schema: { type: string, format: binary }
        "400":
          $ref: "#/components/responses/BadRequest"
        "401":
          $ref: "#/components/responses/Unauthorized"

  /v1/studio/axes:
    get:
      operationId: listVoiceAxes
      summary: List voice-tuning semantic axes
      description: |
        Returns the available semantic axes (gender, pitch, rate, brightness)
        plus how each built-in voice projects onto them and the suggested
        slider range. Powers the Tune popover in the Voice Playground.
      tags:
        - Voice
      responses:
        "200":
          description: Axis catalog
          content:
            application/json:
              schema:
                type: object
                properties:
                  axes:
                    type: array
                    # One object per semantic axis.
                    items:
                      type: object
                      properties:
                        name: { type: string, example: "pitch" }
                        description: { type: string, example: "Pitch: − lower, + higher" }
                        # Suggested slider range for this axis.
                        suggested_min: { type: number, example: -0.28 }
                        suggested_max: { type: number, example: 0.28 }
                        anchor_projections:
                          type: object
                          additionalProperties: { type: number }
                          description: Per-voice-ID projection onto this axis.
        "401":
          $ref: "#/components/responses/Unauthorized"

  /v1/studio/preview:
    post:
      operationId: previewTunedVoice
      summary: Preview an axis-edited voice
      description: |
        Synthesize a one-off audio clip using a built-in voice edited along
        the semantic axes returned by `GET /v1/studio/axes`. The edited voice
        is ephemeral — it is NOT persisted in the voice catalog.
      tags:
        - Voice
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: object
              required:
                - text
                - base_voice
              properties:
                text: { type: string, example: "Tuned voice preview." }
                base_voice:
                  type: string
                  description: Built-in voice ID (`M1`-`F5`) to edit.
                  example: 'F1'
                axes:
                  type: object
                  description: Optional map of axis-name → float value. Omit to preview the unedited base voice.
                  additionalProperties: { type: number }
                  example: { pitch: 0.15, brightness: -0.10 }
                language: { type: string, default: "en" }
      responses:
        "200":
          description: Audio bytes (WAV).
          content:
            audio/wav:
              schema: { type: string, format: binary }
        "400":
          $ref: "#/components/responses/BadRequest"
        "401":
          $ref: "#/components/responses/Unauthorized"

  /v1/agents:
    get:
      operationId: listAgents
      summary: List agents
      description: |
        List the agents owned by your API secret, newest first. Paginated.
      tags:
        - Agent Management
      security:
        - apiSecret: []
      parameters:
        # Paging controls.
        - name: limit
          in: query
          description: Page size (1–100).
          schema:
            type: integer
            default: 20
            minimum: 1
            maximum: 100
        - name: offset
          in: query
          description: Number of agents to skip.
          schema:
            type: integer
            default: 0
            minimum: 0
        # Optional filter.
        - name: status
          in: query
          description: Filter by generation state (e.g. `ready`, `processing`, `failed`).
          schema:
            type: string
      responses:
        "200":
          description: A page of agents
          content:
            application/json:
              schema:
                type: object
                properties:
                  success:
                    type: boolean
                    example: true
                  data:
                    type: array
                    items:
                      $ref: "#/components/schemas/Agent"
                  pagination:
                    $ref: "#/components/schemas/Pagination"
        "401":
          $ref: "#/components/responses/Unauthorized"

  /v1/usage:
    get:
      operationId: getUsage
      summary: Usage history
      description: |
        Return your account's usage / activity history, newest first. Each row
        is a metered event. `credits_change` is the signed credit delta — usage
        events are recorded as **positive** credits consumed.
      tags: [Billing]
      security:
        - apiSecret: []
      parameters:
        # Paging controls.
        - name: limit
          in: query
          description: Page size (1–200).
          schema: { type: integer, default: 50, minimum: 1, maximum: 200 }
        - name: offset
          in: query
          description: Number of events to skip.
          schema: { type: integer, default: 0, minimum: 0 }
        # Time-window filters; both bounds are inclusive per their descriptions.
        - name: start
          in: query
          description: ISO-8601 timestamp; only return events at/after this time.
          schema: { type: string, format: date-time }
        - name: end
          in: query
          description: ISO-8601 timestamp; only return events at/before this time.
          schema: { type: string, format: date-time }
        # Per-agent filter.
        - name: agent_code
          in: query
          description: Only return events for this agent.
          schema: { type: string }
      responses:
        "200":
          description: A page of usage events
          content:
            application/json:
              schema:
                type: object
                properties:
                  success: { type: boolean, example: true }
                  data:
                    type: array
                    items: { $ref: "#/components/schemas/UsageEvent" }
                  pagination:
                    $ref: "#/components/schemas/Pagination"
        "401":
          $ref: "#/components/responses/Unauthorized"

  /v1/webhooks:
    post:
      operationId: createWebhook
      summary: Create a webhook
      description: |
        Register an HTTPS endpoint to receive signed event notifications. The
        response includes a one-time `secret` used to verify the
        `X-BitHuman-Signature` header — store it; it is never returned again.
      tags:
        - Webhooks
      security:
        - apiSecret: []
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: object
              required:
                - url
              properties:
                url:
                  type: string
                  description: HTTPS endpoint to deliver events to.
                  example: 'https://example.com/bithuman/webhook'
                events:
                  type: array
                  description: Event types to subscribe to. Omit or pass `[]` to receive all.
                  items:
                    type: string
                    enum:
                      - agent.ready
                      - agent.failed
                  example:
                    - "agent.ready"
                    - "agent.failed"
                description:
                  type: string
                  description: Optional human-readable label.
      responses:
        "200":
          description: Webhook created (includes the one-time signing secret)
          content:
            application/json:
              schema:
                type: object
                properties:
                  success:
                    type: boolean
                    example: true
                  # This variant of the webhook object carries the `secret` field.
                  data:
                    $ref: "#/components/schemas/WebhookWithSecret"
        "401":
          $ref: "#/components/responses/Unauthorized"
    get:
      operationId: listWebhooks
      summary: List webhooks
      description: List your registered webhooks (signing secrets are redacted).
      tags:
        - Webhooks
      security:
        - apiSecret: []
      responses:
        "200":
          description: Your webhooks
          content:
            application/json:
              schema:
                type: object
                properties:
                  success:
                    type: boolean
                    example: true
                  data:
                    type: array
                    items:
                      $ref: "#/components/schemas/Webhook"
        "401":
          $ref: "#/components/responses/Unauthorized"

  /v1/webhooks/{webhook_id}:
    # Path-level parameter: applies to every operation on this path, so it is
    # declared once instead of being repeated under `get` and `delete`.
    parameters:
      - name: webhook_id
        in: path
        required: true
        schema: { type: string, format: uuid }
    get:
      operationId: getWebhook
      summary: Get a webhook
      tags: [Webhooks]
      security:
        - apiSecret: []
      responses:
        "200":
          description: The webhook (secret redacted)
          content:
            application/json:
              schema:
                type: object
                properties:
                  success: { type: boolean }
                  data: { $ref: "#/components/schemas/Webhook" }
        # The operation requires `api-secret`, so an authentication failure is
        # a possible outcome and is documented like on the other secured routes.
        "401":
          $ref: "#/components/responses/Unauthorized"
        "404":
          $ref: "#/components/responses/NotFound"
    delete:
      operationId: deleteWebhook
      summary: Delete a webhook
      tags: [Webhooks]
      security:
        - apiSecret: []
      responses:
        "200":
          description: Deleted
          content:
            application/json:
              schema:
                type: object
                properties:
                  success: { type: boolean, example: true }
                  deleted: { type: boolean, example: true }
        # Same authentication requirement as `get` above.
        "401":
          $ref: "#/components/responses/Unauthorized"
        "404":
          $ref: "#/components/responses/NotFound"

  /v1/webhooks/{webhook_id}/test:
    post:
      operationId: testWebhook
      summary: Send a test event
      description: Deliver a one-off `ping` event to verify the endpoint is reachable.
      tags: [Webhooks]
      security:
        - apiSecret: []
      parameters:
        - name: webhook_id
          in: path
          required: true
          schema: { type: string, format: uuid }
      responses:
        "200":
          description: Delivery attempted
          content:
            application/json:
              schema:
                type: object
                properties:
                  success: { type: boolean }
                  # Outcome of the test delivery.
                  data:
                    type: object
                    properties:
                      delivered: { type: boolean, example: true }
                      status_code: { type: integer, example: 200 }
                      attempts: { type: integer, example: 1 }
        # The operation requires `api-secret`, so an authentication failure is
        # a possible outcome and is documented like on the other secured routes.
        "401":
          $ref: "#/components/responses/Unauthorized"
        "404":
          $ref: "#/components/responses/NotFound"

  /v1/webhooks/{webhook_id}/deliveries:
    get:
      operationId: listWebhookDeliveries
      summary: List delivery attempts
      description: Recent delivery attempts for a webhook (most recent first).
      tags: [Webhooks]
      security:
        - apiSecret: []
      parameters:
        - name: webhook_id
          in: path
          required: true
          schema: { type: string, format: uuid }
      responses:
        "200":
          description: Delivery log
          content:
            application/json:
              schema:
                type: object
                properties:
                  success: { type: boolean }
                  data:
                    type: array
                    items: { $ref: "#/components/schemas/WebhookDelivery" }
        # The operation requires `api-secret`, so an authentication failure is
        # a possible outcome and is documented like on the other secured routes.
        "401":
          $ref: "#/components/responses/Unauthorized"
        "404":
          $ref: "#/components/responses/NotFound"

components:
  securitySchemes:
    # API-key scheme: the key is sent in the `api-secret` request header.
    apiSecret:
      type: apiKey
      in: header
      name: api-secret
      description: |
        Your bitHuman API secret. Get one at
        [www.bithuman.ai/#developer](https://www.bithuman.ai/#developer).

  schemas:
    Agent:
      type: object
      properties:
        # --- Identity ---
        code:
          type: string
          description: The agent's short code — the primary identifier used across the platform and the embed/viewer URLs.
          example: 'A91XMB7113'
        agent_id: { type: string, example: "A91XMB7113" }
        name: { type: string, example: "Fitness Coach" }
        description: { type: string }
        language: { type: string }
        gender: { type: string }
        # --- Models ---
        model: { type: string }
        supported_models:
          type: array
          items:
            type: string
          description: >-
            Canonical model families this agent can be launched as right now
            — `essence-1` when its `.imx` exists, `essence-2-quality` when a
            **source video** exists (its identity prepares on demand from
            that footage), and `essence-2-light` / `expression-2` once their
            trained per-identity model has been generated. Tier slugs inherit
            their family and are not listed.
          example:
            - "essence-1"
            - "essence-2-quality"
        subject: { type: string }
        system_prompt: { type: string, example: "You are a friendly fitness coach." }
        # --- Assets ---
        image_url:
          type: string
          description: Avatar face image URL.
        video_url:
          type: string
          description: Avatar source video URL.
        model_url:
          type: string
          description: Downloadable .imx model URL.
        voice_id:
          type: string
          description: Voice identifier.
        # --- State and generation progress ---
        status:
          type: string
          description: Current agent state.
        event_type: { type: string }
        progress:
          type: number
          format: float
          minimum: 0
          maximum: 1
          description: Generation progress as a fraction from 0.0 to 1.0.
        progress_msg: { type: string }
        current_step: { type: string }
        error_message: { type: string }
        # --- Timestamps ---
        created_at: { type: string, format: date-time }
        updated_at: { type: string, format: date-time }
        # --- Gestures ---
        dynamics:
          type: object
          description: Gesture configuration and available gestures.

    Pagination:
      # Paging metadata returned next to a `data` array.
      type: object
      properties:
        limit: { type: integer, example: 20 }
        offset: { type: integer, example: 0 }
        total:
          type: integer
          description: Total matching rows across all pages.
          example: 137
        has_more: { type: boolean, example: true }

    UsageEvent:
      # A single usage event. Fields that may be null are declared with a
      # JSON Schema type union (`[<type>, "null"]`): this document is
      # OpenAPI 3.1, whose schema dialect has no `nullable` keyword.
      type: object
      properties:
        id:
          type: string
          format: uuid
        activity_type:
          type: string
          example: "bithuman-serve"
        pricing_code:
          type: string
          example: "usage_essence_model_self_hosted"
        app:
          type: string
          example: "imaginex"
        agent_code:
          type: [string, "null"]
        credits_change:
          type: number
          description: Signed credit delta; usage events are positive credits consumed.
          example: 1
        frames_rendered:
          type: [integer, "null"]
        room_name:
          type: [string, "null"]
        start_time:
          type: [string, "null"]
          format: date-time
        end_time:
          type: [string, "null"]
          format: date-time
        created_at:
          type: string
          format: date-time

    Webhook:
      # A registered webhook. This schema has no `secret` property.
      type: object
      properties:
        id:
          type: string
          format: uuid
        url:
          type: string
          example: "https://example.com/bithuman/webhook"
        events:
          type: array
          items:
            type: string
          description: Subscribed event types; empty means all.
          example: ["agent.ready", "agent.failed"]
        description:
          # OpenAPI 3.1 expresses "may be null" as a type union; the 3.0
          # `nullable` keyword is not part of the 3.1 schema dialect.
          type: [string, "null"]
        active:
          type: boolean
          example: true
        created_at:
          type: string
          format: date-time
        updated_at:
          type: string
          format: date-time

    WebhookWithSecret:
      # Composition: every `Webhook` property plus the `secret` field.
      allOf:
        - { $ref: "#/components/schemas/Webhook" }
        - type: object
          properties:
            secret:
              type: string
              description: |
                HMAC-SHA256 signing secret, returned **only once** at creation.
                Verify deliveries with
                `sha256 = HMAC(secret, "{X-BitHuman-Timestamp}.{raw_body}")`
                and compare against the `X-BitHuman-Signature` header.
              example: 'whsec_60fe3d…'

    WebhookDelivery:
      # One delivery record for a webhook.
      type: object
      properties:
        id:
          type: string
          format: uuid
        event_type:
          type: string
          example: "agent.ready"
        status_code:
          # OpenAPI 3.1 expresses "may be null" as a type union; the 3.0
          # `nullable` keyword is not part of the 3.1 schema dialect.
          type: [integer, "null"]
          description: HTTP status returned by your endpoint; null on network error/timeout.
          example: 200
        attempts:
          type: integer
          example: 1
        delivered:
          type: boolean
          example: true
        created_at:
          type: string
          format: date-time
        updated_at:
          type: string
          format: date-time

    Error:
      # Error envelope: a nested `error` object plus top-level status fields.
      type: object
      properties:
        error:
          type: object
          properties:
            code: { type: string, example: "UNAUTHORIZED" }
            message: { type: string, example: "Invalid API secret" }
            httpStatus: { type: integer, example: 401 }
        status: { type: string, example: "error" }
        status_code: { type: integer, example: 401 }

  responses:
    BadRequest:
      description: Missing required parameter or malformed request. The HTTP status equals `status_code` (400).
      content:
        application/json:
          schema: { $ref: "#/components/schemas/Error" }
          # Sample body for a missing required field.
          example:
            error:
              code: MISSING_PARAM
              message: 'Missing required field: text'
              httpStatus: 400
            status: error
            status_code: 400

    Unauthorized:
      description: |
        Authentication failed. `UNAUTHORIZED` when the `api-secret` header is
        present but invalid; `MISSING_AUTH` when it is absent. The HTTP status
        equals `status_code` (401).
      content:
        application/json:
          schema: { $ref: "#/components/schemas/Error" }
          # Sample body for the "present but invalid" case.
          example:
            error:
              code: UNAUTHORIZED
              message: 'Invalid API secret'
              httpStatus: 401
            status: error
            status_code: 401

    NotFound:
      description: Resource not found
      content:
        application/json:
          schema: { $ref: "#/components/schemas/Error" }
          # Sample body; the message shown is the agent-lookup variant.
          example:
            error:
              code: NOT_FOUND
              message: 'Agent not found'
              httpStatus: 404
            status: error
            status_code: 404

    ValidationError:
      description: Body failed schema validation. The HTTP status equals `status_code` (400).
      content:
        application/json:
          schema: { $ref: "#/components/schemas/Error" }
          # Sample body for a request that failed validation.
          example:
            error:
              code: VALIDATION_ERROR
              message: 'Invalid request parameters'
              httpStatus: 400
            status: error
            status_code: 400

# Sidebar grouping. Renderers that honour `x-tagGroups` typically omit any tag
# that is not listed in a group, so every tag used by an operation must appear
# here.
x-tagGroups:
  - name: Get started
    tags:
      - overview
      - quickstart
      - auth-guide
      - models
      - pricing
  - name: Core concepts
    tags:
      - avatars-imx
      - audio-streaming
      - agent-lifecycle
  - name: SDKs
    tags:
      - sdk-python
      - sdk-swift
      - sdk-flutter
      - cli
  - name: Guides
    tags:
      - local-mode
      - building-avatars
  - name: Deployment
    tags:
      - deploy-livekit
      - deploy-self-hosted
      - deploy-embed
  - name: API reference
    tags:
      - Authentication
      - Voice
      # `Video` is the tag on the talking-video operations (e.g.
      # `getTalkingVideo`); it was missing from every group.
      - Video
      - Agent Generation
      - Agent Management
      - Agent Context
      - Files
      - Dynamics
      - Embedding
      - Billing
      - Account
      - Webhooks
  - name: Resources
    tags:
      - community
      - changelog
      - support

tags:
  ##########################################################################
  # Prose-only sections (no operations). Scalar renders these as standalone
  # pages in the sidebar via x-tagGroups.
  ##########################################################################
  - name: overview
    x-displayName: Overview
    description: |
      ## What bitHuman is

      bitHuman puts a real-time, lip-synced talking avatar in your
      app. Send audio in, get animated video out at 25 FPS with
      sub-200 ms latency. The rendering runs **on the device** — no
      cloud round-trip, no PII leaving the box, no network dependency
      for the avatar itself.

      ### Three places it can render

      | Where | When to use it |
      |---|---|
      | **Cloud** *(default)* | Managed avatar runtime, video published over LiveKit. ~5 min setup with the [LiveKit plugin](#tag/deploy-livekit). |
      | **On-device** | Your machine via the [bitHuman CLI](#tag/cli) or any [SDK](#tag/sdk-python). Optional [fully on-device brain](#tag/local-mode). |
      | **Browser** | ONNX Runtime Web (WASM) renders in the user's tab; the server runs only the brain. |

      ### One engine, every surface

      A single Rust engine (`libessence`) drives the bitHuman CLI, the
      Python / Swift / Flutter SDKs, and the cloud REST API.
      They all read the same `.imx` file and produce identical
      frames — pick the surface that matches what you're building;
      the mental model never changes.

      ### Two avatar models

      - **[Essence](#tag/models)** — a pre-built avatar identity baked
        into an `.imx`. Low memory, runs on every supported platform.
        **The default.**
      - **[Expression](#tag/models)** — animates *any* portrait at
        runtime. Higher close-up quality; needs more compute.

      Both speak the same `.imx` format and the same SDK methods —
      swap them without rewriting your integration.

      ### What you can build

      - **Voice-only chat** — TTS in 30+ languages via [`/v1/tts`](#tag/Voice/operation/synthesizeSpeech)
        or the [OpenAI-compatible drop-in](#tag/Voice/operation/openaiCompatSpeech).
      - **Animated avatars** — make a hosted agent talk via
        [`/v1/agent/{code}/speak`](#tag/Agent-Context/operation/agentSpeak).
      - **Generate new avatars** — turn a prompt + portrait + voice
        sample into a new agent via [`/v1/agent/generate`](#tag/Agent-Generation/operation/generateAgent).
      - **Embed in any app** — REST from any language, native SDKs,
        drop-in LiveKit plugin, Docker image for self-hosted GPU,
        browser iframe.

      <div class="bh-cards">
        <a href="#tag/quickstart" class="bh-card">
          <strong>Start in 2 minutes →</strong>
          <span>Install the CLI, run your first avatar</span>
        </a>
        <a href="#tag/models" class="bh-card">
          <strong>Pick a model →</strong>
          <span>Essence vs Expression</span>
        </a>
        <a href="#tag/Voice" class="bh-card">
          <strong>Try the Voice API →</strong>
          <span>One curl, 30+ languages</span>
        </a>
        <a href="https://www.bithuman.ai/voice" class="bh-card">
          <strong>Voice playground →</strong>
          <span>Hear all 10 voices live</span>
        </a>
      </div>

  - name: quickstart
    x-displayName: Quickstart
    description: |
      ## Your first talking avatar in 2 minutes

      The fastest path is the CLI — no code, no toolchain. Once it
      works, the [SDKs](#tag/sdk-python) do the same thing in your
      language.

      ### 1. Get an API secret

      Sign in at [bithuman.ai](https://www.bithuman.ai) (free tier, no
      credit card), then go to
      [Developer → API Keys](https://www.bithuman.ai/#developer) and
      click **Create new key**. Copy the value — **you won't be able
      to view it again** — and store it somewhere durable.

      ### 2. Install the CLI

      Three install paths, same Rust binary:

      ```bash
      # macOS Homebrew — recommended
      brew install bithuman-product/bithuman/bithuman-cli

      # Universal installer — macOS (Apple Silicon) + Linux, no Python required
      curl -fsSL https://raw.githubusercontent.com/bithuman-product/homebrew-bithuman/main/install.sh | sh

      # PyPI sibling wheel — Python-only environments
      pip install bithuman-cli
      ```

      Verify the install:

      ```bash
      export BITHUMAN_API_SECRET=your_api_secret
      bithuman --version
      bithuman doctor              # check creds, brain, caches
      ```

      ### 3. Make it talk

      ```bash
      export OPENAI_API_KEY=sk-...                              # cloud brain
      bithuman pull modern-court-jester                          # any slug from `bithuman list`
      bithuman run ~/.cache/bithuman/showcase/modern-court-jester.imx
      # → open the printed http://127.0.0.1:8088/<CODE> URL, grant mic, talk
      ```

      Skip the cloud brain entirely with the **on-device brain** — no
      OpenAI key, no outbound network for the conversation, ~1.5 GB RAM:

      ```bash
      pip install 'bithuman-cli[local]'
      BITHUMAN_LOCAL=1 bithuman run ~/.cache/bithuman/showcase/modern-court-jester.imx
      ```

      First run downloads ~860 MB of brain models from HuggingFace
      (~90 s once). Subsequent runs warm-load in under a second.
      [Local mode →](#tag/local-mode).

      Or render an MP4 offline — no browser, no brain — just lipsync
      a WAV you already have:

      ```bash
      bithuman render ~/.cache/bithuman/showcase/modern-court-jester.imx \
        --audio speech.wav --output demo.mp4
      ```

      ### 4. Now in your language

      Same engine, same `.imx`, same `push audio → drain frames`
      shape. Minimal Python:

      ```python
      import asyncio, os
      import numpy as np
      import soundfile as sf
      from bithuman import AsyncBithuman

      # bithuman 2.3 is library-only; inline the former bithuman.audio
      # helpers (the SDK resamples to 16 kHz internally).
      def load_audio(path):
          audio, sr = sf.read(path, dtype="float32", always_2d=False)
          if audio.ndim > 1:
              audio = audio.mean(axis=1)
          return audio, sr

      def float32_to_int16(arr):
          return (np.clip(arr, -1.0, 1.0) * 32767.0).astype(np.int16)

      async def main():
          rt = await AsyncBithuman.create(
              model_path="avatar.imx",
              api_secret=os.environ["BITHUMAN_API_SECRET"],
          )

          pcm, sr = load_audio("speech.wav")
          pcm = float32_to_int16(pcm)
          chunk = sr // 100                       # 10 ms chunks
          for i in range(0, len(pcm), chunk):
              await rt.push_audio(pcm[i:i+chunk].tobytes(), sr, last_chunk=False)
          await rt.flush()

          async for frame in rt.run():
              if frame.has_image:
                  image = frame.bgr_image         # numpy (H, W, 3) uint8
              if frame.end_of_speech:
                  break
          await rt.stop()

      asyncio.run(main())
      ```

      | Concept | What it is |
      |---|---|
      | `AsyncBithuman` | The runtime. One per avatar session. |
      | `push_audio(bytes, sr, last_chunk)` | Feed 16-bit PCM; avatar lip-syncs live. |
      | `flush()` | Mark end of audio input. |
      | `run()` | Async generator yielding frames at 25 FPS. |
      | `frame` | `.bgr_image`, `.audio_chunk`, `.has_image`, `.end_of_speech`. |

      ### Next

      - [Models](#tag/models) — Essence vs Expression
      - [Authentication](#tag/auth-guide) — set keys for every SDK
      - [Python SDK](#tag/sdk-python) · [Swift SDK](#tag/sdk-swift)
      - [Browse the API endpoints →](#tag/Voice)

  - name: auth-guide
    x-displayName: Authentication
    description: |
      ## Authentication

      bitHuman uses a single shared credential per account that
      authenticates every SDK and the REST API. The same value goes by
      two equivalent env-var names, depending on which SDK you're using:

      - **`BITHUMAN_API_SECRET`** — Python SDK, REST API, LiveKit
        plugin, CLI.
      - **`BITHUMAN_API_KEY`** — Swift SDK on Apple platforms. Same
        value, different env-var name to match Apple convention.

      ### Get a key

      1. Sign in at [bithuman.ai](https://www.bithuman.ai).
      2. Go to [Developer → API Keys](https://www.bithuman.ai/#developer).
      3. Click **Create new key**, name it, and copy the value.

      Verify it works:

      ```bash
      curl -X POST https://api.bithuman.ai/v1/validate \
        -H "api-secret: YOUR_KEY"
      ```

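      The endpoint always returns HTTP 200, so check the body:
      `{"valid": true, ...}` means you're good; `{"valid": false}`
      means the key is invalid or missing.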

      ### Set it for your SDK

      **Python SDK** — env var, or pass directly:

      ```python
      runtime = await AsyncBithuman.create(
          model_path="avatar.imx",
          api_secret="your_key",
      )
      ```

      **REST API** — `api-secret` header on every request:

      ```bash
      curl -X POST https://api.bithuman.ai/v1/agent/A78WKV4515/speak \
        -H "api-secret: $BITHUMAN_API_SECRET" \
        -H "content-type: application/json" \
        -d '{"message": "Hello"}'
      ```

      **Swift SDK** — env var or explicit:

      ```swift
      config.apiKey = ProcessInfo.processInfo.environment["BITHUMAN_API_KEY"]
      // or
      config.apiKey = await fetchFromBackend()    // production: Keychain
      ```

      **Never hardcode the key in source.** For DMG distribution, bake
      it into Info.plist via a build script. For App Store, fetch it
      from your own backend on first launch and store it in the
      Keychain.

      ### Audio-only Swift mode is unmetered

      If you only want on-device voice chat (no lip-synced avatar),
      **skip the API key entirely**:

      ```swift
      var config = VoiceChatConfig()
      config.systemPrompt = "You are a helpful assistant."
      config.voice = .preset("Aiden")
      // no config.avatar = ...

      let chat = VoiceChat(config: config)
      try await chat.start()  // does not authenticate
      ```

      This mode runs fully offline (after first-launch weight
      downloads), bills nothing, and doesn't require a key.

      ### How the flow works

      1. Your code provides the API secret to the SDK / REST request.
      2. The SDK exchanges it for a short-lived **runtime token** at
         `https://api.bithuman.ai/v1/runtime-tokens/request`.
      3. That token authenticates the actual avatar engine (heartbeat
         + frame production).
      4. Tokens auto-renew every minute via the heartbeat.
      5. Bad keys fail at step 2 — fast — before any user-visible work.

      The `BITHUMAN_API_SECRET` never leaves your machine for any
      destination other than `api.bithuman.ai` over TLS.

      ### Rotating keys

      Rotate from the [Developer dashboard](https://www.bithuman.ai/#developer).
      Rotation invalidates the old key immediately; live sessions
      pause within ~60 s and resume on restart with the new key.
      There's no overlap window — rotate during a maintenance window
      if you have production sessions running.

  - name: models
    x-displayName: Models
    description: |
      ## Essence vs Expression

      Two avatar models, same SDK methods, same `.imx` format — swap
      them without rewriting your integration.

      | Model | What it is | Best for |
      |---|---|---|
      | **Essence** | A pre-built avatar identity in an `.imx`. Low memory, runs on every supported platform. **The default.** | Kiosks, mobile, edge, high-concurrency servers |
      | **Expression** | Animates *any* portrait at runtime. Higher close-up quality; needs more compute. | Native desktop apps, per-session custom faces, GPU cloud |

      Both speak the same `push audio → drain frames` shape across
      Python, Swift, and Flutter.

      ### Essence

      The default model. One `.imx` file = one avatar identity.
      Pre-built at upload time, served by the lightweight
      `libessence` runtime — runs on CPU (Raspberry Pi to server),
      NVIDIA GPU, Apple Silicon (Mac/iPad/iPhone), and in the browser
      via WASM.

      Pick Essence when you want:

      - Predictable resource use (low memory, no GPU required)
      - Many concurrent sessions on the same machine
      - Mobile / embedded / edge deployments
      - Browser-side rendering

      ### Expression

      Real-time animation of *any* portrait. Higher close-up quality,
      especially for nuanced facial expressions, but requires a GPU
      and more memory. Ships as a Docker image for self-hosted
      deployment, and is built into the Swift SDK on Apple Silicon
      M3+ (Mac) and M4+ (iPad Pro) — auto-spawned
      `bithuman-expression-daemon`.

      Pick Expression when you want:

      - Per-session custom faces (one user's portrait → their avatar)
      - The highest visual quality at close range
      - Cloud GPU deployments (NVIDIA + TRT) or M3+ Mac / M4+ iPad apps

      ### The second generation (GA 2026-07-01)

      Three second-generation models are generally available:

      | Model | What it is | Serving tiers | Cloud | Self-hosted |
      |---|---|---|---|---|
      | `expression-2` | Second-gen expression engine — audio-driven real-time avatar video from a single photo (creation trains a per-identity model, ~45 min) | gpu · cpu · ane | 4 cr/min | 2 cr/min |
      | `essence-2-quality` | Highest-fidelity Essence tier (cloud GPU; identity prepped from your source video in seconds) | gpu | 8 cr/min | 4 cr/min |
      | `essence-2-light` | Cost-effective distilled Essence tier — runs on gpu, cpu, and ane (Apple Neural Engine), including on-device (creation distills an identity bundle, typically 25–40 min) | gpu · cpu · ane | 4 cr/min | 2 cr/min |

      Create one with
      [`POST /v1/agent/generate`](#tag/Agent-Generation/operation/generateAgent)
      and `model: "expression-2"` (same shape for the other two); serve it
      through the normal session flows — e.g. embed agent `A56ZFX6217` and the
      platform routes the session to the model's best available tier.
      `model: "essence-2"` is the **combined Essence 2 creation** — one
      500-credit charge covers both tiers (pick the tier at launch) — and
      `model: "auto"` classifies your input and routes it (photorealistic
      person → `essence-2`, cartoon/animal/exotic → `expression-2`). You can
      also add a model to an agent you already have with
      [`POST /v1/agent/{code}/models`](#tag/Agent-Management/operation/addAgentModel)
      and download a generated artifact with
      [`GET /v1/agent/{code}/model/download`](#tag/Agent-Management/operation/downloadAgentModel).
      Family overview: [Essence 2 & Expression 2](https://docs.bithuman.ai/concepts/models-v2);
      per-model guides: [Expression 2](https://docs.bithuman.ai/concepts/expression-2)
      · [Essence 2 Quality](https://docs.bithuman.ai/concepts/essence-2-quality)
      · [Essence 2 Light](https://docs.bithuman.ai/concepts/essence-2-light).

  - name: pricing
    x-displayName: Pricing
    description: |
      ## Pricing & credits

      Free tier — **99 credits/month, no credit card**. Enough to
      try every endpoint and run a small avatar session.

      Paid plans start at **$20/month (Creator)**. Credits roll over
      within a billing period.
      [**See current pricing →**](https://www.bithuman.ai/pricing)

      ### What costs credits

      | Action | Cost |
      |---|---|
      | Agent generation — v1 models (`essence-1`, `expression-1`) | **250 credits** (one-time, per avatar) |
      | Agent generation — second generation (`essence-2` combined, `essence-2-quality`, `essence-2-light`, `expression-2`, `auto`) | **500 credits** (one-time, per avatar) |
      | Add a model to an existing agent (`POST /v1/agent/{code}/models`) | same per-model rates; adding `expression-1` is **free** (instant enablement) |
      | Live avatar session — Essence 1 (self-hosted) | **1 credit / minute** |
      | Live avatar session — Essence 1 (cloud) | **2 credits / minute** |
      | Live avatar session — Essence 2 Quality (self-hosted) | **4 credits / minute** |
      | Live avatar session — Essence 2 Quality (cloud) | **8 credits / minute** |
      | Live avatar session — Essence 2 Light (self-hosted) | **2 credits / minute** |
      | Live avatar session — Essence 2 Light (cloud) | **4 credits / minute** |
      | Live avatar session — Expression 1 (self-hosted) | **2 credits / minute** |
      | Live avatar session — Expression 1 (cloud) | **4 credits / minute** |
      | Live avatar session — Expression 2 (self-hosted) | **2 credits / minute** |
      | Live avatar session — Expression 2 (cloud) | **4 credits / minute** |
      | TTS / voice synthesis | **Currently free** within standard [rate limits](https://docs.bithuman.ai/api/rate-limits) |
      | File upload + dynamics generation | varies; see endpoint docs |

      ### What's free

      - On-device Swift voice chat **without** avatar — fully offline,
        unmetered.
      - Local mode brain (`BITHUMAN_LOCAL=1`) for everything except
        the API secret exchange itself.
      - Dashboard, Discord, docs.

      Check your current balance with
      [`GET /v2/credit-summaries`](#tag/Billing/operation/getCreditSummaries).

  ##########################################################################
  # Core concepts — the mental model
  ##########################################################################
  - name: avatars-imx
    x-displayName: Avatars & .imx files
    description: |
      ## Avatars and the `.imx` format

      Every bitHuman avatar is packaged as a single `.imx` file — a
      self-contained bundle of identity weights, textures, and metadata
      that the `libessence` engine reads to animate one specific face.

      ### Where `.imx` files come from

      | Source | How |
      |---|---|
      | **Showcase** | `bithuman pull <slug>` — pre-built avatars at [bithuman.ai → Explore](https://www.bithuman.ai/explore). |
      | **Dashboard** | Upload a portrait + voice samples at [bithuman.ai → Studio](https://www.bithuman.ai). |
      | **API** | [`POST /v1/agent/generate`](#tag/Agent-Generation/operation/generateAgent) returns an `agent_code` whose `.imx` you can download. |

      ### Where the file lives

      The `.imx` is keyed by **agent code** (e.g. `A78WKV4515`). Once
      generated, every SDK and the cloud runtime can resolve it by
      code — no need to ship the file with your app.

      ```bash
      # Cache locally for offline use
      bithuman pull modern-court-jester
      # → ~/.cache/bithuman/showcase/modern-court-jester.imx
      ```

      ### What's inside

      You don't have to understand it, but for the curious:

      - **Identity weights** — a small neural net specific to the face
      - **Reference frames** — texture atlases for the head
      - **Voice profile** — embedding for the cloned voice (Essence)
      - **Manifest** — model version, ABI, license, training metadata

      Inspect it with `bithuman info avatar.imx`.

      ### File format stability

      The `.imx` format is **forward-compatible within a major
      version**. The first time you open an older `.imx` with a newer
      runtime, the runtime warms it up and silently upgrades the file.
      Keep the runtime warm in production to avoid that cost
      per-session.

  - name: audio-streaming
    x-displayName: Audio streaming
    description: |
      ## The push/drain pattern

      Every SDK and the runtime use the same shape:

      1. **Push** 16-bit PCM audio chunks as they arrive (mic, TTS,
         WebRTC).
      2. **Drain** lip-synced video frames at 25 FPS.

      Audio in, video out — that's the entire surface area.

      ### Python — the minimal loop

      ```python
      import asyncio, os
      import numpy as np
      import soundfile as sf
      from bithuman import AsyncBithuman

      # bithuman 2.3 is library-only; inline the former bithuman.audio
      # helpers (the SDK resamples to 16 kHz internally).
      def load_audio(path):
          audio, sr = sf.read(path, dtype="float32", always_2d=False)
          if audio.ndim > 1:
              audio = audio.mean(axis=1)
          return audio, sr

      def float32_to_int16(arr):
          return (np.clip(arr, -1.0, 1.0) * 32767.0).astype(np.int16)

      async def main():
          rt = await AsyncBithuman.create(
              model_path="avatar.imx",
              api_secret=os.environ["BITHUMAN_API_SECRET"],
          )

          pcm, sr = load_audio("speech.wav")
          pcm = float32_to_int16(pcm)
          chunk = sr // 100                       # 10 ms chunks
          for i in range(0, len(pcm), chunk):
              await rt.push_audio(pcm[i:i+chunk].tobytes(), sr, last_chunk=False)
          await rt.flush()

          async for frame in rt.run():
              if frame.has_image:
                  image = frame.bgr_image         # numpy (H, W, 3) uint8
              if frame.end_of_speech:
                  break
          await rt.stop()

      asyncio.run(main())
      ```

      ### Audio format

      | Property | Value |
      |---|---|
      | Encoding | 16-bit signed PCM (`int16`) |
      | Channels | Mono |
      | Sample rate | Any (SDK auto-resamples) |
      | Chunk size | Anything; 10–40 ms is typical |

      ### Frame format

      Each yielded `frame` exposes:

      | Field | Type | What it is |
      |---|---|---|
      | `bgr_image` | `numpy.ndarray` (H, W, 3) `uint8` | The rendered video frame |
      | `audio_chunk` | `bytes` | Audio aligned with the frame |
      | `has_image` | `bool` | False for filler frames during silence |
      | `end_of_speech` | `bool` | True on the last frame of a turn |

      Frames arrive at **25 FPS** regardless of audio chunk size.

      ### When the avatar isn't speaking

      During silence, the runtime emits filler frames (`has_image=False`)
      so your render loop keeps its 25 FPS cadence. Skip them or render
      a static idle frame.

  - name: agent-lifecycle
    x-displayName: Agent lifecycle
    description: |
      ## The agent lifecycle

      From "I have a face and voice" to "live talking avatar":

      ```
      Generate → Store → Resolve → Live session → Speak
      ```

      ### 1. Generate

      [`POST /v1/agent/generate`](#tag/Agent-Generation/operation/generateAgent)
      with a prompt + image + voice sample. Returns `{agent_id, status:
      "processing"}` immediately — generation runs async.

      ### 2. Poll until ready

      [`GET /v1/agent/status/{agent_id}`](#tag/Agent-Generation/operation/getAgentStatus)
      — poll every 5 s. Status values:

      ```
      processing / generating / completed   (intermediate)
        → success / ready                   (terminal success)
        → failed                            (terminal error)
      ```

      `processing`, `generating`, and `completed` are **intermediate** — keep
      polling past them (don't stop on `completed`; it can appear early, even
      around ~5% `progress`) until you see `success`/`ready` or `failed`. Gate
      on `progress` reaching `1.0` together with a terminal status. Typical
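      wall-clock: 2–5 minutes; second-generation models take longer
      (~45 min for `expression-2`, typically 25–40 min for
      `essence-2-light` — see [Models](#tag/models)).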

      ### 3. Resolve & stream

      Once the status is `success`/`ready`, the agent has an `agent_code`
      (e.g. `A78WKV4515`).
      Every SDK can resolve it by code:

      ```python
      rt = await AsyncBithuman.create(
          agent_code="A78WKV4515",  # not model_path — code resolves to .imx
          api_secret=os.environ["BITHUMAN_API_SECRET"],
      )
      ```

      ### 4. Drive the live session

      For a hosted LiveKit session, push text into the live room:

      ```bash
      curl -X POST https://api.bithuman.ai/v1/agent/A78WKV4515/speak \
        -H "api-secret: $BITHUMAN_API_SECRET" \
        -H "content-type: application/json" \
        -d '{"message": "Hello!"}'
      ```

      For silent knowledge injection (avatar doesn't say it aloud but
      uses it in future replies):

      ```bash
      curl -X POST https://api.bithuman.ai/v1/agent/A78WKV4515/add-context \
        -H "api-secret: $BITHUMAN_API_SECRET" \
        -H "content-type: application/json" \
        -d '{"context": "The customer is on the Pro plan."}'
      ```

      ### Updating an agent

      [`POST /v1/agent/{code}`](#tag/Agent-Management/operation/updateAgent)
      updates the system prompt only — for new face/voice, generate a
      new agent.

  ##########################################################################
  # SDK sections
  ##########################################################################
  - name: sdk-python
    x-displayName: Python SDK
    description: |
      ## Python SDK

      ```bash
      pip install bithuman --upgrade
      ```

      Python 3.10–3.14, pre-built wheels for macOS arm64 (macOS 26+) and
      Linux x86_64 / aarch64 (manylinux_2_28). Windows and macOS-Intel are not
      part of the 2.x distribution — pin the 1.x line if you need them. As of
      2.3.0 this wheel is the **Python SDK library only** (~5 MB):
      runtime + LiveKit plugin glue. The `bithuman` CLI binary ships
      separately — `pip install bithuman-cli` (the PyPI wheel is **macOS Apple
      Silicon only**) or `brew install bithuman-product/bithuman/bithuman-cli`;
      on Linux, install the CLI with the universal installer.

      ```python
      import asyncio, os
      from bithuman import AsyncBithuman

      async def main():
          rt = await AsyncBithuman.create(
              model_path="avatar.imx",
              api_secret=os.environ["BITHUMAN_API_SECRET"],
          )
          print(rt.frame_width, "x", rt.frame_height)
          await rt.stop()

      asyncio.run(main())
      ```

      [Full Python SDK reference →](https://github.com/bithuman-product/bithuman-sdk-public/blob/main/python/README.md)

  - name: sdk-swift
    x-displayName: Swift SDK
    description: |
      ## Swift SDK

      In Xcode: **File → Add Package Dependencies…** →
      `https://github.com/bithuman-product/bithuman-sdk-public.git` →
      pin to **0.8.1 or later** (current release **0.8.2**).

      ```swift
      // Package.swift
      .package(
          url: "https://github.com/bithuman-product/bithuman-sdk-public.git",
          from: "0.8.1"   // 0.8.1 up to (not including) 1.0.0 — 0.8.2 today
      )
      ```

      ```swift
      import bitHumanKit

      let runtime = try await Bithuman.createRuntime(modelPath: modelURL)
      ```

      Binary XCFramework, zero transitive SwiftPM deps. Runs on:
      - macOS 26+ (Tahoe) on Apple Silicon M3+
      - iPad Pro M4+
      - iPhone 16 Pro+ (A18 Pro)

      Expression on Apple Silicon auto-spawns
      `bithuman-expression-daemon` as a subprocess. On unsupported
      hardware, `ExpressionModelNotSupported` is raised — not a crash.

      [Full Swift SDK reference →](https://github.com/bithuman-product/bithuman-sdk-public/blob/main/README.md)

  - name: sdk-flutter
    x-displayName: Flutter plugin
    description: |
      ## Flutter plugin

      ```yaml
      # pubspec.yaml
      dependencies:
        bithuman: ^1.16.0
      ```

      One Dart codebase across macOS and iOS, with built-in
      per-platform echo cancellation.

      Pub.dev publish is pending — request the git-dep URL on
      [Discord](https://discord.gg/ES953n7bPA).

  - name: cli
    x-displayName: CLI (bithuman)
    description: |
      ## The bitHuman CLI

      A standalone Rust binary that drives the full avatar engine
      without writing any code. Same engine as the SDKs, same `.imx`
      format.

      ### Install

      ```bash
      # macOS — Homebrew (canonical)
      brew install bithuman-product/bithuman/bithuman-cli

      # macOS (Apple Silicon) + Linux — universal installer, no Python required
      curl -fsSL https://raw.githubusercontent.com/bithuman-product/homebrew-bithuman/main/install.sh | sh

      # Python-only env (PyPI wheel is macOS Apple Silicon only; on Linux use the universal installer)
      pip install bithuman-cli
      ```

      ### Subcommands

      | Command | What it does |
      |---|---|
      | `bithuman list` | Browse showcase avatars available via `pull` |
      | `bithuman pull <slug>` | Download an avatar into `~/.cache/bithuman/showcase/` |
      | `bithuman info <path.imx>` | Inspect an `.imx` file's metadata |
      | `bithuman run <path.imx>` | Start the live avatar (mic in, browser viewer out) |
      | `bithuman render <path.imx> --audio x.wav --output y.mp4` | Offline lipsync — no browser, no brain |
      | `bithuman doctor` | Check credentials, brain selection, caches |
      | `bithuman --version` | Print engine + CLI versions |

      ### Live demo

      ```bash
      export BITHUMAN_API_SECRET=your_api_secret
      export OPENAI_API_KEY=sk-...                              # cloud brain
      bithuman pull modern-court-jester
      bithuman run ~/.cache/bithuman/showcase/modern-court-jester.imx
      # → open the printed http://127.0.0.1:8088/<CODE>, grant mic, talk
      ```

      ### On-device brain (no OpenAI key)

      ```bash
      pip install 'bithuman-cli[local]'
      BITHUMAN_LOCAL=1 bithuman run ~/.cache/bithuman/showcase/modern-court-jester.imx
      ```

      See [Local mode →](#tag/local-mode) for the on-device stack
      (whisper.cpp + llama.cpp + Supertonic + Silero VAD).

      <div class="bh-cards">
        <a href="#tag/quickstart" class="bh-card">
          <strong>Quickstart →</strong>
          <span>Full 2-minute walkthrough</span>
        </a>
        <a href="#tag/local-mode" class="bh-card">
          <strong>Local mode →</strong>
          <span>Run fully offline</span>
        </a>
      </div>

  ##########################################################################
  # Guides
  ##########################################################################
  - name: local-mode
    x-displayName: Local mode (on-device brain)
    description: |
      ## Local mode

      For private, no-cloud operation, install the `[local]` extra on
      the CLI package and set `BITHUMAN_LOCAL=1`. The conversation
      brain swaps from OpenAI Realtime to an entirely in-process
      stack — no OpenAI API key, no outbound network for the
      conversation. (`BITHUMAN_API_SECRET` is still required.)

      ```bash
      pip install 'bithuman-cli[local]'
      export BITHUMAN_API_SECRET=...
      bithuman pull modern-court-jester
      BITHUMAN_LOCAL=1 bithuman run ~/.cache/bithuman/showcase/modern-court-jester.imx
      ```

      The local stack ships:

      - **STT** — whisper.cpp (`tiny.en` by default, configurable)
      - **LLM** — llama.cpp + a GGUF model (`Qwen 2.5 0.5B-Instruct Q4_K_M` by default)
      - **TTS** — Supertonic (the same model bitHuman ships in
        production for `/v1/tts`)
      - **VAD** — Silero

      ~1.5 GB RAM peak, ~860 MB one-time HuggingFace download.

      ### Tuning

      - `BITHUMAN_LOCAL_LLM=<gguf-path>` — swap to a bigger / smaller
        model
      - `BITHUMAN_LOCAL_LANG=<iso2>` — pin transcript language
        (default `en`)
      - `BITHUMAN_LOCAL_WHISPER=<name>` — pick the whisper variant

      ### Troubleshooting

      **Is local mode actually wired up?** `bithuman doctor` verifies
      the `[local]` extras are importable and lists the resolved
      backend versions.

      **First run feels stuck.** It's downloading ~860 MB from
      HuggingFace. Watch network activity; subsequent runs start in
      &lt;1 s.

      **LLM is too dumb.** Bump to a bigger GGUF — point
      `BITHUMAN_LOCAL_LLM` at it (see Tuning above).

      **TTS pronounces wrong language.** Set `BITHUMAN_LOCAL_LANG` to
      match your transcript language.

  - name: building-avatars
    x-displayName: Building avatars
    description: |
      ## Building avatars

      Two ways to get an avatar `.imx`:

      ### 1. Download a pre-built one

      Browse [Explore](https://www.bithuman.ai/explore), open the
      **⋮** menu on any agent → **Download**. Or via CLI:

      ```bash
      bithuman list                          # browse showcase avatars
      bithuman pull modern-court-jester      # cache locally
      ```

      ### 2. Generate from your own portrait

      Call [`POST /v1/agent/generate`](#tag/Agent-Generation/operation/generateAgent) with:

      - **Prompt** — system instructions for the agent's personality
      - **Portrait** — front-facing photo or video
      - **Voice samples** — 3–10 s of clean speech for voice cloning
      - (Optional) **Knowledge** — documents the agent should know about

      ```bash
      curl -X POST https://api.bithuman.ai/v1/agent/generate \
        -H "api-secret: $BITHUMAN_API_SECRET" \
        -H "content-type: application/json" \
        -d '{
          "prompt": "You are a friendly receptionist.",
          "image": "https://example.com/headshot.jpg",
          "audio": "https://example.com/voice.wav"
        }'
      ```

      The response includes an `agent_id` — once generation completes
      you can use it with [`POST /v1/agent/{code}/speak`](#tag/Agent-Context/operation/agentSpeak).

      ### Media tips

      - **Portrait**: front-facing, neutral expression, even lighting,
        eyes open, mouth closed. Avoid hats, glasses, hands in frame.
      - **Voice**: 3–10 seconds of clean speech, no background noise,
        no music, mono WAV preferred (16 kHz+).

  ##########################################################################
  # Deployment — three focused pages
  ##########################################################################
  - name: deploy-livekit
    x-displayName: Managed (LiveKit plugin)
    description: |
      ## Deploy via LiveKit plugin

      The fastest path to production. Drop the bitHuman avatar into any
      LiveKit agent worker. Managed runtime, ~5-minute setup.

      ### Install

      ```bash
      pip install livekit-plugins-bithuman
      ```

      ### Wire into an agent worker

      ```python
      import os
      from livekit.plugins import bithuman

      avatar = bithuman.AvatarSession(
          avatar_id="A78WKV4515",
          api_secret=os.environ["BITHUMAN_API_SECRET"],
      )
      ```

      Each session bills at the [self-hosted or cloud rate](#tag/pricing)
      depending on whether the avatar GPU is yours or ours.

      ### What you get

      - **Managed avatar runtime** — no GPU to provision, no Docker to
        operate
      - **LiveKit Cloud-compatible** — works with both LiveKit Cloud
        and self-hosted LiveKit servers
      - **WebRTC delivery** — video streamed via LiveKit's media
        pipeline to any client

      See the [LiveKit integration
      guide](https://docs.bithuman.ai/sdk/livekit) for the
      voice-agent integration recipe.

      <div class="bh-cards">
        <a href="#tag/deploy-self-hosted" class="bh-card">
          <strong>Self-hosted GPU →</strong>
          <span>Run on your own NVIDIA hardware</span>
        </a>
        <a href="#tag/deploy-embed" class="bh-card">
          <strong>Browser embed →</strong>
          <span>Drop an iframe on any page</span>
        </a>
      </div>

  - name: deploy-self-hosted
    x-displayName: Self-hosted GPU (Docker)
    description: |
      ## Self-hosted GPU deployment

      Run Expression on your own NVIDIA hardware. The Docker image is
      published on Docker Hub and ships everything baked in.

      ### Pull and run

      ```bash
      docker run --gpus all -p 8089:8089 \
        -v bithuman-models:/data/models \
        --tmpfs /tmp/bh-weights:size=9g,mode=0700 \
        --env-file ./bithuman.env \
        sgubithuman/expression-avatar:latest
      ```

      Point a LiveKit agent worker at `http://localhost:8089/launch` —
      the worker will spawn render sessions on demand.

      ### Hardware floor

      | GPU | Concurrent Essence sessions |
      |---|---|
      | RTX 3090 (24 GB) | 6–8 |
      | RTX 4090 (24 GB) | **8–10** (recommended) |
      | H100 (80 GB) | 30+ (overkill for most loads) |

      Expression sessions are heavier — budget ~3 GB VRAM per session.

      ### Billing

      Self-hosted GPU sessions bill at the **self-hosted rate** (1 cr/min
      Essence, 2 cr/min Expression). See [pricing](#tag/pricing) for the
      cloud-vs-self-hosted breakdown.

  - name: deploy-embed
    x-displayName: Browser embed
    description: |
      ## Browser embed

      Embed an agent as an iframe — no SDK install, drop it on any page:

      ```html
      <iframe
        src="https://agent.viewer.bithuman.ai/api/embed/A78WKV4515"
        allow="microphone; camera; autoplay"
        style="width: 100%; height: 600px; border: 0;"
      ></iframe>
      ```

      Replace `A78WKV4515` with your agent code (the same value you send
      as `agent_id` when minting embed tokens).

      ### Production: mint short-lived tokens

      For per-visitor session tracking and rate limiting, mint a
      short-lived embed token from your backend and append it to the
      iframe URL as a `token` query parameter:

      ```js
      // server: mint token (api-secret never reaches the browser)
      const res = await fetch("https://api.bithuman.ai/v1/embed-tokens/request", {
        method: "POST",
        headers: {
          "api-secret": process.env.BITHUMAN_API_SECRET,
          "content-type": "application/json",
        },
        body: JSON.stringify({
          agent_id: "A78WKV4515",
          fingerprint: visitorFingerprint,  // stable per-device hex
        }),
      });
      const { data: { token } } = await res.json();

      // browser: pass token in the iframe URL
      // <iframe src="https://agent.viewer.bithuman.ai/api/embed/A78WKV4515?token={token}" .../>
      ```

      The token is a **1-hour JWT**. Mint a fresh one for each visitor
      session. See
      [POST /v1/embed-tokens/request](#tag/Embedding/operation/requestEmbedToken)
      for the full request shape.

  ##########################################################################
  # Community
  ##########################################################################
  - name: community
    x-displayName: Community
    description: |
      ## Community

      - **GitHub** —
        [bithuman-product/bithuman-sdk-public](https://github.com/bithuman-product/bithuman-sdk-public)
        for issues, examples, and Swift SDK source distribution.
      - **Discord** —
        [discord.gg/ES953n7bPA](https://discord.gg/ES953n7bPA) — questions,
        showcases, and roadmap discussions.
      - **Twitter / X** —
        [@bithuman_ai](https://twitter.com/bithuman_ai) — release
        announcements and demos.

      ### Contributing

      We accept community PRs for SDK examples, integration plugins
      (LiveKit, Pipecat, Vapi, etc.), and docs. See
      [CONTRIBUTING.md](https://github.com/bithuman-product/bithuman-sdk-public/blob/main/CONTRIBUTING.md)
      for the RFC process.

      <div class="bh-cards">
        <a href="https://discord.gg/ES953n7bPA" class="bh-card">
          <strong>Join Discord →</strong>
          <span>Talk to the team and community</span>
        </a>
        <a href="https://github.com/bithuman-product/bithuman-sdk-public" class="bh-card">
          <strong>GitHub →</strong>
          <span>Issues, examples, SDK source</span>
        </a>
      </div>

  - name: changelog
    x-displayName: Changelog & releases
    description: |
      ## Changelog & releases

      - **SDK release notes** —
        [github.com/bithuman-product/bithuman-sdk-public/releases](https://github.com/bithuman-product/bithuman-sdk-public/releases)
        for tagged releases across Python, Swift, and Flutter.
      - **Downloads page** —
        [releases.bithuman.ai](https://releases.bithuman.ai) for
        pre-built wheels, the platform support matrix, and the
        device-by-device compatibility table.
      - **API changelog** — breaking changes to the REST API are
        announced on Discord and in the GitHub repo's CHANGELOG. The
        REST API itself is versioned via `/v1/...` / `/v2/...` paths;
        new endpoints land additively without forcing migrations.

      ### Versioning policy

      | Surface | Versioning |
      |---|---|
      | REST API | URL path (`/v1/`, `/v2/`) — no breaking changes within a path version |
      | Python SDK | Semver — minor bumps may add params; major bumps may rename |
      | Swift SDK | Semver — floor `0.8.1` per Package.swift (current release `0.8.2`) |
      | Flutter SDK | Semver — pinned in `pubspec.yaml` |
      | `.imx` file format | Forward-compatible within a major; auto-upgraded on first load |

  - name: support
    x-displayName: Support & status
    description: |
      ## Support & status

      ### Status

      [**status.bithuman.ai**](https://status.bithuman.ai) — live
      status of `api.bithuman.ai`, the GPU avatar pool, TTS, agent
      generation, and the dashboard.

      ### Where to get help

      | I want to… | Go here |
      |---|---|
      | Report a bug | [GitHub issues](https://github.com/bithuman-product/bithuman-sdk-public/issues) |
      | Ask a question | [Discord #help](https://discord.gg/ES953n7bPA) |
      | Request a feature | [Discord #ideas](https://discord.gg/ES953n7bPA) |
      | Email support | hello@bithuman.ai |
      | Security disclosure | hello@bithuman.ai (see [SECURITY.md](https://github.com/bithuman-product/bithuman-sdk-public/blob/main/SECURITY.md)) |

      ### Service status

      All of `api.bithuman.ai` runs behind Cloudflare with multiple
      origins for high availability. When a regional issue surfaces,
      the status page is the first place we publish updates.

  ##########################################################################
  # API tags (these have actual operations attached)
  ##########################################################################
  - name: Authentication
    description: Verify API credentials with `POST /v1/validate`.
  - name: Voice
    description: |
      Text-to-speech synthesis in 30+ languages with 10 built-in
      voices, plus an OpenAI-compatible drop-in (`/v1/audio/speech`)
      and a voice-tuning preview surface.

      **Try it live:** [www.bithuman.ai/voice](https://www.bithuman.ai/voice).
  - name: Agent Generation
    description: Create new avatar agents from prompts, portraits, and voice samples.
  - name: Agent Management
    description: Retrieve and update existing agents.
  - name: Agent Context
    description: Control live avatar sessions — make them speak or inject knowledge.
  - name: Files
    description: Upload images, videos, audio, and documents.
  - name: Billing
    description: Check credit balance, plan details, and usage history.
  - name: Account
    description: Identity, plan, credits, and pricing.
  - name: Webhooks
    description: |
      Register HTTPS endpoints to receive signed event notifications when
      async work finishes (currently `agent.ready` and `agent.failed`).
      Each delivery carries an `X-BitHuman-Signature: sha256=<hmac>`
      header — verify it with the secret, which is returned only once, at
      creation.
  - name: Embedding
    description: Mint short-lived tokens for website avatar embeds.
  - name: Dynamics
    description: |
      Generate and manage conversational gesture animations (wave,
      nod, laugh, idle motions) for an avatar. Generation runs
      asynchronously; once it completes, toggle the dynamics on or off
      to control whether they are the active model for live sessions.
      Gestures are triggered by keyword mapping during conversation.
