> ## Documentation Index
> Fetch the complete documentation index at: https://docs.siliconflow.com/llms.txt
> Use this file to discover all available pages before exploring further.

# Create Text-to-Speech Request

> Generate audio from input text. The data generated by the interface is the binary data of the audio, which requires the user to handle it themselves. Reference: https://docs.siliconflow.com/capabilities/text-to-speech#5


## OpenAPI

````yaml post /audio/speech
# OpenAPI 3.0 document header: API metadata, base server URL and the
# document-wide security requirement.
openapi: 3.0.0
info:
  title: SiliconFlow API
  description: The SiliconFlow REST API
  version: 1.0.0
  contact:
    name: SiliconFlow Support
    url: https://www.siliconflow.com/
  license:
    name: MIT
    url: https://github.com/siliconflow-inc/siliconflow-api/blob/main/LICENSE
# Base URL; the keys under `paths` are resolved relative to it.
servers:
  - url: https://api.siliconflow.com/v1
# Default security requirement: operations use the `bearerAuth` scheme
# declared under components.securitySchemes.
security:
  - bearerAuth: []
paths:
  /audio/speech:
    # Text-to-speech operation: JSON request body in, binary audio out.
    post:
      tags:
        - Audio
      summary: Create Speech
      description: >-
        Generate audio from input text. The data generated by the interface is
        the binary data of the audio, which requires the user to handle it
        themselves.
        Reference: https://docs.siliconflow.com/capabilities/text-to-speech#5
      operationId: createSpeech
      requestBody:
        required: true
        content:
          application/json:
            schema:
              # One request schema per model. Each schema pins `model` to a
              # single enum value, so a body can match at most one branch.
              oneOf:
                - $ref: '#/components/schemas/fish-speech-1.5'
                - $ref: '#/components/schemas/CosyVoice2-0.5B'
      responses:
        '200':
          description: >-
            Generate audio based on the input text. The data generated by the
            interface is in binary format and requires the user to process it
            themselves.
            Reference: https://docs.siliconflow.com/capabilities/text-to-speech#5
          headers:
            Transfer-Encoding:
              schema:
                type: string
              description: chunked
          content:
            # NOTE(review): `application/audio` does not look like a registered
            # media type, and there are no entries matching the `mp3` / `pcm`
            # values of `response_format` - confirm the Content-Type values
            # the server actually returns.
            application/audio:
              schema:
                type: string
                format: binary
                example: Audio binary data
            audio/wav:
              schema:
                type: string
                format: binary
                example: Audio binary data
            audio/opus:
              schema:
                type: string
                format: binary
                example: Audio binary data
        '400':
          description: BadRequest
          content:
            application/json:
              schema:
                # `$ref` stands alone: OpenAPI 3.0 ignores sibling keys next
                # to a reference, and the target schema already declares
                # `type: object`. The schema name is misspelled but must match
                # the key under components.schemas.
                $ref: '#/components/schemas/BadRquestData'
        '401':
          description: Unauthorized
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/UnauthorizedData'
        '404':
          description: NotFound
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/NotFoundData'
        '429':
          description: RateLimit
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/RateLimitData'
        '503':
          description: Overloaded
          content:
            application/json:
              schema:
                # Name is misspelled ("Overloadedt") but must match the key
                # under components.schemas.
                $ref: '#/components/schemas/OverloadedtData'
        '504':
          description: Timeout
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/TimeoutData'
components:
  schemas:
    # Request body for the fishaudio/fish-speech-1.5 model.
    # `model`, `input` and `voice` are mandatory; unknown fields are rejected
    # (`additionalProperties: false`).
    fish-speech-1.5:
      title: fish-speech-1.5
      type: object
      required:
        - model
        - input
        - voice
      additionalProperties: false
      properties:
        model:
          type: string
          enum:
            - fishaudio/fish-speech-1.5
          description: >-
            Corresponding Model Name. To better enhance service quality, we will
            make periodic changes to the models provided by this service,
            including but not limited to model on/offlining and adjustments to
            model service capabilities. We will notify you of such changes
            through appropriate means such as announcements or message pushes
            where feasible.
        input:
          type: string
          description: The text to generate audio for.
          example: The text to generate audio for
          maxLength: 128000
          minLength: 1
        # Voice identifiers are prefixed with the model name.
        # NOTE(review): this property has no `description` - consider adding one.
        voice:
          type: string
          enum:
            - fishaudio/fish-speech-1.5:alex
            - fishaudio/fish-speech-1.5:anna
            - fishaudio/fish-speech-1.5:bella
            - fishaudio/fish-speech-1.5:benjamin
            - fishaudio/fish-speech-1.5:charles
            - fishaudio/fish-speech-1.5:claire
            - fishaudio/fish-speech-1.5:david
            - fishaudio/fish-speech-1.5:diana
        response_format:
          description: >-
            The audio output format. Supported formats are `mp3`, `opus`, `wav`,
            `pcm`.
          default: mp3
          type: string
          enum:
            - mp3
            - opus
            - wav
            - pcm
        # The enum is the union of the per-format rates listed in the
        # description; which rates are valid depends on `response_format`.
        sample_rate:
          description: >-
            Control the output sample rate. The supported values and defaults
            differ by audio output format, as follows: opus: Supports 48000 Hz.
            wav, pcm: Supports 8000, 16000, 24000, 32000, 44100 Hz, with a
            default of 44100 Hz. mp3: Supports 32000, 44100 Hz, with a default
            of 44100 Hz.
          type: number
          example: 32000
          enum:
            - 8000
            - 16000
            - 24000
            - 32000
            - 44100
            - 48000
        stream:
          description: streaming or not
          type: boolean
          default: true
        speed:
          type: number
          description: >-
            The speed of the generated audio. Select a value from `0.25` to
            `4.0`. `1.0` is the default.
          format: float
          minimum: 0.25
          maximum: 4
          default: 1
        # NOTE(review): no `description`; presumably an output volume
        # adjustment - confirm meaning and unit before documenting it.
        gain:
          type: number
          format: float
          minimum: -10
          maximum: 10
          default: 0
    # Request body for the FunAudioLLM/CosyVoice2-0.5B model.
    # Only `model` and `input` are mandatory; `voice` is optional here because
    # `references` can be supplied instead. Unknown fields are rejected
    # (`additionalProperties: false`).
    CosyVoice2-0.5B:
      title: CosyVoice2-0.5B
      type: object
      required:
        - model
        - input
      additionalProperties: false
      properties:
        model:
          type: string
          enum:
            - FunAudioLLM/CosyVoice2-0.5B
          description: >-
            Corresponding Model Name. To better enhance service quality, we will
            make periodic changes to the models provided by this service,
            including but not limited to model on/offlining and adjustments to
            model service capabilities. We will notify you of such changes
            through appropriate means such as announcements or message pushes
            where feasible.
        input:
          type: string
          description: >-
            For natural language instructions, add a special end marker
            "<|endofprompt|>" before the natural language description. These
            descriptions cover aspects such as emotion, speaking speed,
            role-playing, and dialects. For detailed instructions, insert pitch
            bursts between text markers, using markers like "[laughter]" and
            "[breath]." Additionally, we apply pitch feature markers to phrases;
            for example:Can you say it with a happy emotion? <|endofprompt|>
            Today is really happy, Spring Festival is coming! I’m so happy,
            Spring Festival is coming! [laughter] [breath].
          example: >-
            Can you say it with a happy emotion? <|endofprompt|>I'm so happy,
            Spring Festival is coming!
          default: >-
            Can you say it with a happy emotion? <|endofprompt|>I'm so happy,
            Spring Festival is coming!
          maxLength: 128000
          minLength: 1
        # Voice identifiers are prefixed with the model name.
        voice:
          type: string
          enum:
            - FunAudioLLM/CosyVoice2-0.5B:alex
            - FunAudioLLM/CosyVoice2-0.5B:anna
            - FunAudioLLM/CosyVoice2-0.5B:bella
            - FunAudioLLM/CosyVoice2-0.5B:benjamin
            - FunAudioLLM/CosyVoice2-0.5B:charles
            - FunAudioLLM/CosyVoice2-0.5B:claire
            - FunAudioLLM/CosyVoice2-0.5B:david
            - FunAudioLLM/CosyVoice2-0.5B:diana
        # The mutual exclusion with `voice` is stated in prose only; nothing in
        # this schema enforces it.
        references:
          description: The voice field and references field are mutually exclusive.
          type: array
          items:
            type: object
            properties:
              audio:
                description: >-
                  The audio content, which can be either a URL pointing to an
                  audio file or a base64-encoded audio string.
                # anyOf rather than oneOf: a `data:` URI is itself a
                # syntactically valid URI, so a base64 value can satisfy both
                # branches and would be rejected by oneOf wherever
                # `format: uri` is enforced.
                anyOf:
                  - type: string
                    format: uri
                    description: >-
                      A URL pointing to an audio file (e.g.,
                      `https://example.com/audio.mp3`).
                  - type: string
                    pattern: ^data:audio\/\w+;base64,[A-Za-z0-9+/=]+$
                    description: >-
                      A base64-encoded audio string (e.g.,
                      `data:audio/mp3;base64,ABC123...`).
              # NOTE(review): this property used to carry the audio wording
              # (now on `audio`); the description below is inferred from the
              # field name - confirm against the service behaviour.
              text:
                description: The text corresponding to the reference audio.
                type: string
        response_format:
          description: >-
            The audio output format. Supported formats are `mp3`, `opus`, `wav`,
            `pcm`.
          default: mp3
          type: string
          enum:
            - mp3
            - opus
            - wav
            - pcm
        # 32000 is an example, not a default: the description gives 44100 Hz as
        # the default for wav/pcm/mp3 and lists only 48000 Hz for opus. The
        # enum is the union of the per-format rates listed in the description.
        sample_rate:
          description: >-
            Control the output sample rate. The supported values and defaults
            differ by audio output format, as follows: opus: Supports 48000 Hz.
            wav, pcm: Supports 8000, 16000, 24000, 32000, 44100 Hz, with a
            default of 44100 Hz. mp3: Supports 32000, 44100 Hz, with a default
            of 44100 Hz.
          type: number
          example: 32000
          enum:
            - 8000
            - 16000
            - 24000
            - 32000
            - 44100
            - 48000
        # NOTE(review): the fish-speech-1.5 schema declares `default: true` for
        # this field; confirm whether the same default applies here.
        stream:
          description: streaming or not
          type: boolean
        speed:
          type: number
          description: >-
            The speed of the generated audio. Select a value from `0.25` to
            `4.0`. `1.0` is the default.
          format: float
          minimum: 0.25
          maximum: 4
          default: 1
        # NOTE(review): no `description`; presumably an output volume
        # adjustment - confirm meaning and unit before documenting it.
        gain:
          type: number
          format: float
          minimum: -10
          maximum: 10
          default: 0
    # Error body returned with HTTP 400. The misspelled name ("Rquest") is kept
    # because the 400 response `$ref` points at this exact key.
    BadRquestData:
      type: object
      required:
        - message
        - data
        - code
      properties:
        # No `default` here: a default must conform to the property's type, and
        # a boolean is not a valid value for an integer field.
        code:
          type: integer
          nullable: true
          example: 20012
        message:
          type: string
          nullable: false
        data:
          type: string
          nullable: false
    # Body returned with HTTP 401: a bare string. No `default` is declared
    # because a boolean is not a valid value for a string schema.
    UnauthorizedData:
      type: string
      example: Invalid token
    # Body returned with HTTP 404: a bare string. No `default` is declared
    # because a boolean is not a valid value for a string schema.
    NotFoundData:
      type: string
      example: 404 page not found
    # Error body returned with HTTP 429. Unlike the 400 and 503 bodies it
    # declares no `code` property; only `message` and `data` are required.
    RateLimitData:
      type: object
      required:
        - message
        - data
      properties:
        message:
          type: string
          example: >-
            Request was rejected due to rate limiting. If you want more, please
            contact contact@siliconflow.com. Details:TPM limit reached.
        data:
          type: string
    # Error body returned with HTTP 503. The name is misspelled ("Overloadedt")
    # but the 503 response `$ref` points at this exact key, so renaming it
    # requires updating that reference at the same time.
    OverloadedtData:
      type: object
      required:
        - code
        - message
        - data
      properties:
        code:
          type: integer
          example: 50505
        message:
          type: string
          example: Model service overloaded. Please try again later.
        data:
          type: string
          nullable: false
    # Body returned with HTTP 504: a bare string with no example or constraints.
    TimeoutData:
      type: string
  # HTTP bearer-token authentication, referenced by the top-level `security`
  # requirement. `bearerFormat` is only a documentation hint about the token.
  securitySchemes:
    bearerAuth:
      type: http
      scheme: bearer
      bearerFormat: your api key
      description: >-
        Use the following format for authentication: Bearer [<your api
        key>](https://cloud.siliconflow.com/account/ak)

````