Meta推AI聊天機器人新功能:主動發送消息提升互動體驗
Abacus.AI重磅推出DeepAgent,全能AI助手引領企業智能化轉型
大模型時代,通用視覺模型將何去何從?
X平台試點AI生成“社區筆記”,Grok接入信息核查流程
Gemini可以分析和理解音頻輸入,從而實現以下用例:
1. 描述、總結或回答與音頻內容相關的問題。
2. 提供音頻轉寫內容。
3. 分析音頻的特定片段。
在調用 Gemini API 之前,請確保您已安裝所選的 SDK,並已配置好可用的 Gemini API 密鑰。
您可以通過以下方式向Gemini 提供音頻數據:
先上傳音頻文件,然後再向generateContent 發出請求。
通過請求傳遞內嵌音頻數據到generateContent。
您可以使用Files API 上傳音頻文件。如果請求總大小(包括文件、文本提示、系統說明等)超過20 MB,請始終使用Files API。
以下代碼會上傳音頻文件,然後在對generateContent 的調用中使用該文件。
# Upload an audio clip with the Files API, then ask the model to describe it.
from google import genai

client = genai.Client(api_key="GOOGLE_API_KEY")

# Upload the local file so the request can reference it by handle.
myfile = client.files.upload(file="path/to/sample.mp3")

# Send the text prompt together with the uploaded-file handle.
response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=["Describe this audio clip", myfile],
)
print(response.text)
// Upload an audio file via the Files API, then reference it in a
// generateContent request.
import {
GoogleGenAI,
createUserContent,
createPartFromUri,
} from "@google/genai";
// NOTE(review): replace with a real key; avoid hard-coding API keys.
const ai = new GoogleGenAI({ apiKey: "GOOGLE_API_KEY" });
async function main() {
// Upload the local file; mimeType tells the service how to decode it.
const myfile = await ai.files.upload({
file: "path/to/sample.mp3",
config: { mimeType: "audio/mp3" },
});
// Reference the uploaded file by URI alongside the text prompt.
const response = await ai.models.generateContent({
model: "gemini-2.0-flash",
contents: createUserContent([
createPartFromUri(myfile.uri, myfile.mimeType),
"Describe this audio clip",
]),
});
console.log(response.text);
}
// NOTE(review): scrape artifact — the start of the Go upload snippet was
// fused onto this line after `await main();`.
await main(); file, err := client.UploadFileFromPath(ctx, "path/to/sample.mp3", nil)
// Go continuation of the upload example: handle the upload error,
// schedule cleanup, and generate content from the uploaded file.
if err != nil {
log.Fatal(err)
}
// Delete the uploaded file from the service when this function returns.
defer client.DeleteFile(ctx, file.Name)
model := client.GenerativeModel("gemini-2.0-flash")
// Pass the uploaded file's URI plus a text prompt to the model.
resp, err := model.GenerateContent(ctx,
genai.FileData{URI: file.URI},
genai.Text("Describe this audio clip"))
if err != nil {
log.Fatal(err)
}
printResponse(resp)
# Resumable upload of an audio file to the Gemini Files API, followed by a
# generateContent call that references the uploaded file (REST/curl example).
AUDIO_PATH="path/to/sample.mp3"
MIME_TYPE=$(file -b --mime-type "${AUDIO_PATH}")
NUM_BYTES=$(wc -c < "${AUDIO_PATH}")
DISPLAY_NAME=AUDIO
tmp_header_file=upload-header.tmp

# Initial resumable request defining metadata.
# The upload url is in the response headers; dump them to a file.
curl "https://generativelanguage.googleapis.com/upload/v1beta/files?key=${GOOGLE_API_KEY}" \
  -D "${tmp_header_file}" \
  -H "X-Goog-Upload-Protocol: resumable" \
  -H "X-Goog-Upload-Command: start" \
  -H "X-Goog-Upload-Header-Content-Length: ${NUM_BYTES}" \
  -H "X-Goog-Upload-Header-Content-Type: ${MIME_TYPE}" \
  -H "Content-Type: application/json" \
  -d "{'file': {'display_name': '${DISPLAY_NAME}'}}" 2> /dev/null

# Extract the upload URL from the dumped headers. Strip the trailing
# carriage return that HTTP header lines carry (\r, not the letter r).
upload_url=$(grep -i "x-goog-upload-url: " "${tmp_header_file}" | cut -d" " -f2 | tr -d "\r")
rm "${tmp_header_file}"

# Upload the actual bytes.
curl "${upload_url}" \
  -H "Content-Length: ${NUM_BYTES}" \
  -H "X-Goog-Upload-Offset: 0" \
  -H "X-Goog-Upload-Command: upload, finalize" \
  --data-binary "@${AUDIO_PATH}" 2> /dev/null > file_info.json

# file_uri keeps the surrounding double quotes emitted by jq, so it can be
# spliced into the JSON payload below as-is.
file_uri=$(jq ".file.uri" file_info.json)
echo file_uri=$file_uri

# Now generate content using that file. MIME_TYPE must be expanded by the
# shell, so it is spliced in with double quotes rather than left inside the
# single-quoted payload.
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key=$GOOGLE_API_KEY" \
  -H 'Content-Type: application/json' \
  -X POST \
  -d '{
    "contents": [{
      "parts":[
        {"text": "Describe this audio clip"},
        {"file_data":{"mime_type": "'"${MIME_TYPE}"'", "file_uri": '$file_uri'}}]
      }]
    }' 2> /dev/null > response.json

cat response.json
echo

jq ".candidates[].content.parts[].text" response.json
您可以將請求中的內嵌音頻數據傳遞給generateContent,而不是上傳音頻文件:
from google.genai import types

# Read the clip from disk as raw bytes for inline transmission.
with open('path/to/small-sample.mp3', 'rb') as audio_file:
    clip_bytes = audio_file.read()

# Wrap the bytes in a Part so the request carries the audio inline.
audio_part = types.Part.from_bytes(
    data=clip_bytes,
    mime_type='audio/mp3',
)
response = client.models.generate_content(
    model='gemini-2.0-flash',
    contents=['Describe this audio clip', audio_part],
)
// NOTE(review): scrape artifact — the Python print() call from the previous
// example was fused with the start of this JavaScript inline-audio example.
print(response.text) import { GoogleGenAI } from "@google/genai";
import * as fs from "node:fs";
const ai = new GoogleGenAI({ apiKey: "GOOGLE_API_KEY" });
// Read the clip and base64-encode it for inline transmission.
const base64AudioFile = fs.readFileSync("path/to/small-sample.mp3", {
encoding: "base64",
});
// Inline audio parts carry the data directly in the request body
// (subject to the 20 MB total-request limit mentioned in the article).
const contents = [
{ text: "Please summarize the audio." },
{
inlineData: {
mimeType: "audio/mp3",
data: base64AudioFile,
},
},
];
const response = await ai.models.generateContent({
model: "gemini-2.0-flash",
contents: contents,
});
console.log(response.text); // Initialize a Gemini model appropriate for your use case.
model := client.GenerativeModel("gemini-2.0-flash")
// Read the whole audio file into memory for inline transmission.
bytes, err := os.ReadFile("path/to/small-sample.mp3")
if err != nil {
log.Fatal(err)
}
// An inline Blob part carries the audio data directly in the request.
prompt := []genai.Part{
genai.Blob{MIMEType: "audio/mp3", Data: bytes},
genai.Text("Please summarize the audio."),
}
// Generate content using the prompt.
resp, err := model.GenerateContent(ctx, prompt...)
if err != nil {
log.Fatal(err)
}
// Handle the response of generated text
for _, c := range resp.Candidates {
if c.Content != nil {
fmt.Println(*c.Content)
}
// NOTE(review): scrape artifact — the closing brace below was fused with the
// following Chinese paragraph.
}關於內嵌音頻數據,請注意以下幾點:
1. 請求大小上限為20 MB,其中包括文本提示、系統說明和內嵌的文件。如果文件大小會導致請求總大小超過20 MB,請使用Files API 上傳音頻文件以在請求中使用。
2. 如果您要多次使用音頻選段,則更高效的方式是上傳音頻文件。
如需獲取音頻數據的轉寫內容,只需在提示中提出請求即可:
# Upload the audio file, then request a transcript of the speech in it.
myfile = client.files.upload(file='path/to/sample.mp3')
prompt = 'Generate a transcript of the speech.'
response = client.models.generate_content(
    model='gemini-2.0-flash',
    contents=[prompt, myfile],
)
print(response.text)
// Upload an audio file, then ask the model for a transcript of it.
import {
GoogleGenAI,
createUserContent,
createPartFromUri,
} from "@google/genai";
const ai = new GoogleGenAI({ apiKey: "GOOGLE_API_KEY" });
const myfile = await ai.files.upload({
file: "path/to/sample.mp3",
config: { mimeType: "audio/mpeg" },
});
// Reference the uploaded file by URI next to the transcript request.
const result = await ai.models.generateContent({
model: "gemini-2.0-flash",
contents: createUserContent([
createPartFromUri(myfile.uri, myfile.mimeType),
"Generate a transcript of the speech.",
]),
});
// NOTE(review): scrape artifact — the Go comment on this line belongs to the
// next (Go) example.
console.log("result.text=", result.text); // Initialize a Gemini model appropriate for your use case.
model := client.GenerativeModel("gemini-2.0-flash")
// Create a prompt using text and the URI reference for the uploaded file.
// NOTE(review): sampleAudio is a previously uploaded file handle; it is not
// defined in this snippet — confirm against the full example.
prompt := []genai.Part{
genai.FileData{URI: sampleAudio.URI},
genai.Text("Generate a transcript of the speech."),
}
// Generate content using the prompt.
resp, err := model.GenerateContent(ctx, prompt...)
if err != nil {
log.Fatal(err)
}
// Handle the response of generated text
for _, c := range resp.Candidates {
if c.Content != nil {
fmt.Println(*c.Content)
}
// NOTE(review): scrape artifact — the closing brace below was fused with the
// following Chinese paragraph.
}您可以使用形式為MM:SS 的時間戳來引用音頻文件的特定部分。例如,以下提示會請求轉寫內容,
1. 從文件開頭算起,開始時間為2 分30 秒。
2. 從文件開頭算起,結束時間為3 分29 秒。
# Create a prompt containing timestamps (MM:SS, measured from the start of
# the file). In the scraped source the assignment was fused onto the comment
# line, which turned it into dead code; it is restored here.
prompt = "Provide a transcript of the speech from 02:30 to 03:29."
// Create a prompt containing timestamps (MM:SS from the start of the file).
// In the scraped source the declaration shared the comment line and was
// therefore commented out; it is restored here.
const prompt = "Provide a transcript of the speech from 02:30 to 03:29.";
// Create a prompt containing timestamps.
// Timestamps use MM:SS, measured from the start of the audio file.
prompt := []genai.Part{
genai.FileData{URI: sampleAudio.URI},
genai.Text("Provide a transcript of the speech from 02:30 to 03:29."),
// NOTE(review): scrape artifact — the closing brace below was fused with the
// following Chinese paragraph.
}調用countTokens 方法可獲取音頻文件中的令牌數量。例如:
# Count the tokens the uploaded audio file will consume in a request.
response = client.models.count_tokens(
    model='gemini-2.0-flash',
    contents=[myfile],
)
print(response)
// Upload an audio file and count the tokens it will occupy in a request.
import {
GoogleGenAI,
createUserContent,
createPartFromUri,
} from "@google/genai";
const ai = new GoogleGenAI({ apiKey: "GOOGLE_API_KEY" });
const myfile = await ai.files.upload({
file: "path/to/sample.mp3",
config: { mimeType: "audio/mpeg" },
});
// countTokens accepts the same content parts as generateContent.
const countTokensResponse = await ai.models.countTokens({
model: "gemini-2.0-flash",
contents: createUserContent([
createPartFromUri(myfile.uri, myfile.mimeType),
]),
});
// NOTE(review): scrape artifact — the start of the Go CountTokens snippet
// was fused onto this line.
console.log(countTokensResponse.totalTokens); tokens, err := model.CountTokens(ctx, genai.FileData{URI: sampleAudio.URI})
// Go continuation of the CountTokens example: report the token total.
if err != nil {
log.Fatal(err)
}
// NOTE(review): scrape artifact — the Printf call below was fused with the
// Chinese paragraph that follows it.
fmt.Printf("File %s is %d tokens", sampleAudio.DisplayName, tokens.TotalTokens)Gemini 支持以下音頻格式MIME 類型:
1. WAV - audio/wav
2. MP3 - audio/mp3
3. AIFF - audio/aiff
4. AAC - audio/aac
5. OGG Vorbis - audio/ogg
6. FLAC - audio/flac
1. Gemini 將每秒的音頻表示為32 個令牌;例如,一分鐘的音頻表示為1,920 個令牌。
2. Gemini 僅支持對英語語音內容進行推理和回答。
3. Gemini可以“理解”非語音內容,例如鳥鳴或警笛。
4. 單個問題中音頻數據的支持時長上限為9.5 小時。 Gemini 不限制單個問題中的音頻文件數量;不過,單個問題中的所有音頻文件總時長不得超過9.5 小時。
5. Gemini 會將音頻文件下採樣為16 Kbps 的數據分辨率。
6. 如果音頻源包含多個聲道,Gemini 會將這些聲道合併為一個聲道。