Gemini models can process images, enabling many advanced developer use cases that previously required domain-specific models. Some of Gemini's vision capabilities include:
1. Caption images and answer questions about them
2. Transcribe and reason over PDFs (up to 2 million tokens)
3. Detect objects in an image and return their bounding box coordinates
4. Segment objects within an image
Gemini was built to be multimodal from the ground up, and we continue to push the boundaries of what is possible.
Before calling the Gemini API, make sure you have installed your SDK of choice and have a Gemini API key configured and ready to use.
You can provide images as input to Gemini in two ways:
1. Upload the image file with the Files API, then reference it in a generateContent request. Use this method for files larger than 20 MB, or when you want to reuse a file across multiple requests.
2. Pass inline image data directly in the generateContent request. Use this method for smaller files (total request size under 20 MB) or for images fetched directly from a URL.
You can use the Files API to upload image files. Always use the Files API if the total request size (including files, text prompts, system instructions, etc.) exceeds 20 MB, or if you intend to use the same image in multiple prompts.
The following code uploads the image file and then uses it when calling generateContent.
Python:
from google import genai

client = genai.Client(api_key="GOOGLE_API_KEY")

my_file = client.files.upload(file="path/to/sample.jpg")

response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=[my_file, "Caption this image."],
)

print(response.text)
JavaScript:
import {
  GoogleGenAI,
  createUserContent,
  createPartFromUri,
} from "@google/genai";

const ai = new GoogleGenAI({ apiKey: "GOOGLE_API_KEY" });

async function main() {
  const myfile = await ai.files.upload({
    file: "path/to/sample.jpg",
    config: { mimeType: "image/jpeg" },
  });

  const response = await ai.models.generateContent({
    model: "gemini-2.0-flash",
    contents: createUserContent([
      createPartFromUri(myfile.uri, myfile.mimeType),
      "Caption this image.",
    ]),
  });
  console.log(response.text);
}

await main();
Go:
file, err := client.UploadFileFromPath(ctx, "path/to/sample.jpg", nil)
if err != nil {
    log.Fatal(err)
}
defer client.DeleteFile(ctx, file.Name)

model := client.GenerativeModel("gemini-2.0-flash")
resp, err := model.GenerateContent(ctx,
    genai.FileData{URI: file.URI},
    genai.Text("Caption this image."))
if err != nil {
    log.Fatal(err)
}

printResponse(resp)
IMAGE_PATH="path/to/sample.jpg" MIME_TYPE=$(file -b --mime-type "${IMAGE_PATH}") NUM_BYTES=$(wc -c < "${IMAGE_PATH}") DISPLAY_NAME=IMAGE tmp_header_file=upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl "https://generativelanguage.googleapis.com/upload/v1beta/files?key=${GOOGLE_API_KEY}" -D upload-header.tmp -H "X-Goog-Upload-Protocol: resumable" -H "X-Goog-Upload-Command: start" -H "X-Goog-Upload-Header-Content-Length: ${NUM_BYTES}" -H "X-Goog-Upload-Header-Content-Type: ${MIME_TYPE}" -H "Content-Type: application/json" -d "{'file': {'display_name': '${DISPLAY_NAME}'}}" 2> /dev/null upload_url=$(grep -i "x-goog-upload-url: " "${tmp_header_file}" | cut -d" " -f2 | tr -d "r") rm "${tmp_header_file}" # Upload the actual bytes. curl "${upload_url}" -H "Content-Length: ${NUM_BYTES}" -H "X-Goog-Upload-Offset: 0" -H "X-Goog-Upload-Command: upload, finalize" --data-binary "@${IMAGE_PATH}" 2> /dev/null> file_info.json file_uri=$(jq ".file.uri" file_info.json) echo file_uri=$file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key=$GOOGLE_API_KEY" -H 'Content-Type: application/json' -X POST -d '{ "contents": [{ "parts":[ {"file_data":{"mime_type": "${MIME_TYPE}", "file_uri": '$file_uri'}}, {"text": "Caption this image."}] }] }' 2> /dev/null> response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json
For more information on how to process media files, see the Files API.
You can pass inline image data in the generateContent request without uploading the image file first. This works for smaller images (total request size under 20 MB) or images fetched directly from a URL.
You can provide image data as Base64 encoded strings or read local files directly (depending on the SDK).
Local image files:
Python:
from google import genai
from google.genai import types

client = genai.Client(api_key="GOOGLE_API_KEY")

with open('path/to/small-sample.jpg', 'rb') as f:
    image_bytes = f.read()

response = client.models.generate_content(
    model='gemini-2.0-flash',
    contents=[
        types.Part.from_bytes(
            data=image_bytes,
            mime_type='image/jpeg',
        ),
        'Caption this image.'
    ]
)

print(response.text)
JavaScript:
import { GoogleGenAI } from "@google/genai";
import * as fs from "node:fs";

const ai = new GoogleGenAI({ apiKey: "GOOGLE_API_KEY" });

const base64ImageFile = fs.readFileSync("path/to/small-sample.jpg", {
  encoding: "base64",
});

const contents = [
  {
    inlineData: {
      mimeType: "image/jpeg",
      data: base64ImageFile,
    },
  },
  { text: "Caption this image." },
];

const response = await ai.models.generateContent({
  model: "gemini-2.0-flash",
  contents: contents,
});
console.log(response.text);
Go:
model := client.GenerativeModel("gemini-2.0-flash")

bytes, err := os.ReadFile("path/to/small-sample.jpg")
if err != nil {
    log.Fatal(err)
}

prompt := []genai.Part{
    genai.Blob{MIMEType: "image/jpeg", Data: bytes},
    genai.Text("Caption this image."),
}

resp, err := model.GenerateContent(ctx, prompt...)
if err != nil {
    log.Fatal(err)
}

for _, c := range resp.Candidates {
    if c.Content != nil {
        fmt.Println(*c.Content)
    }
}
REST:
IMG_PATH=/path/to/your/image1.jpg

if [[ "$(base64 --version 2>&1)" = *"FreeBSD"* ]]; then
  B64FLAGS="--input"
else
  B64FLAGS="-w0"
fi

curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key=$GOOGLE_API_KEY" \
  -H 'Content-Type: application/json' \
  -X POST \
  -d '{
    "contents": [{
      "parts": [
        {
          "inline_data": {
            "mime_type": "image/jpeg",
            "data": "'$(base64 $B64FLAGS $IMG_PATH)'"
          }
        },
        {"text": "Caption this image."}
      ]
    }]
  }' 2> /dev/null
Images added via URL:
Python:
from google import genai
from google.genai import types

import requests

image_path = "https://goo.gle/instrument-img"
image_bytes = requests.get(image_path).content

image = types.Part.from_bytes(
    data=image_bytes,
    mime_type="image/jpeg"
)

client = genai.Client(api_key="GOOGLE_API_KEY")
response = client.models.generate_content(
    model="gemini-2.0-flash-exp",
    contents=["What is this image?", image],
)

print(response.text)
JavaScript:
import { GoogleGenAI } from "@google/genai";

async function main() {
  const ai = new GoogleGenAI({ apiKey: process.env.GOOGLE_API_KEY });

  const imageUrl = "https://goo.gle/instrument-img";
  const response = await fetch(imageUrl);
  const imageArrayBuffer = await response.arrayBuffer();
  const base64ImageData = Buffer.from(imageArrayBuffer).toString('base64');

  const result = await ai.models.generateContent({
    model: "gemini-2.0-flash",
    contents: [
      {
        inlineData: {
          mimeType: 'image/jpeg',
          data: base64ImageData,
        },
      },
      { text: "Caption this image." }
    ],
  });
  console.log(result.text);
}

main();
Go:
func main() {
    ctx := context.Background()
    client, err := genai.NewClient(ctx, option.WithAPIKey(os.Getenv("GOOGLE_API_KEY")))
    if err != nil {
        log.Fatal(err)
    }
    defer client.Close()

    model := client.GenerativeModel("gemini-2.0-flash")

    // Download the image.
    imageResp, err := http.Get("https://goo.gle/instrument-img")
    if err != nil {
        panic(err)
    }
    defer imageResp.Body.Close()

    imageBytes, err := io.ReadAll(imageResp.Body)
    if err != nil {
        panic(err)
    }

    // Create the request.
    req := []genai.Part{
        genai.ImageData("jpeg", imageBytes),
        genai.Text("Caption this image."),
    }

    // Generate content.
    resp, err := model.GenerateContent(ctx, req...)
    if err != nil {
        panic(err)
    }

    // Handle the response of generated text.
    for _, c := range resp.Candidates {
        if c.Content != nil {
            fmt.Println(*c.Content)
        }
    }
}
REST:
IMG_URL="https://goo.gle/instrument-img"

MIME_TYPE=$(curl -sIL "$IMG_URL" | grep -i '^content-type:' | awk -F ': ' '{print $2}' | sed 's/\r$//' | head -n 1)
if [[ -z "$MIME_TYPE" || ! "$MIME_TYPE" == image/* ]]; then
  MIME_TYPE="image/jpeg"
fi

if [[ "$(base64 --version 2>&1)" = *"FreeBSD"* ]]; then
  B64FLAGS="--input"
else
  B64FLAGS="-w0"
fi

curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key=$GOOGLE_API_KEY" \
  -H 'Content-Type: application/json' \
  -X POST \
  -d '{
    "contents": [{
      "parts": [
        {
          "inline_data": {
            "mime_type": "'"$MIME_TYPE"'",
            "data": "'$(curl -sL "$IMG_URL" | base64 $B64FLAGS)'"
          }
        },
        {"text": "Caption this image."}
      ]
    }]
  }' 2> /dev/null
Keep the following in mind when working with inline image data:
The total request size is capped at 20 MB, which includes text prompts, system instructions, and all files provided inline. If a file would push the total request size over 20 MB, use the Files API to upload the image instead.
If you will use the same image in multiple requests, uploading it once with the Files API is more efficient.
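As a rule of thumb, you can branch on file size when choosing between the two methods. The sketch below uses the google-genai Python SDK; the 15 MB threshold is an arbitrary conservative margin under the 20 MB total-request cap, and the file path is a placeholder.

Python:
# Sketch: choose inline bytes vs. the Files API based on file size.
# The 15 MB threshold is an assumption, not an official cutoff.
import os
from google import genai
from google.genai import types

client = genai.Client(api_key="GOOGLE_API_KEY")
IMAGE_PATH = "path/to/sample.jpg"

if os.path.getsize(IMAGE_PATH) > 15 * 1024 * 1024:
    # Large image: upload once via the Files API and reference it.
    image_part = client.files.upload(file=IMAGE_PATH)
else:
    # Small image: send the bytes inline with the request.
    with open(IMAGE_PATH, "rb") as f:
        image_part = types.Part.from_bytes(data=f.read(), mime_type="image/jpeg")

response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=[image_part, "Caption this image."],
)
print(response.text)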
You can provide multiple images in a single prompt by including multiple image Part objects in the contents array. These can be a mix of inline data (local files or URLs) and Files API references.
Python:
from google import genai
from google.genai import types

client = genai.Client(api_key="GOOGLE_API_KEY")

# Upload the first image
image1_path = "path/to/image1.jpg"
uploaded_file = client.files.upload(file=image1_path)

# Prepare the second image as inline data
image2_path = "path/to/image2.png"
with open(image2_path, 'rb') as f:
    img2_bytes = f.read()

# Create the prompt with text and multiple images
response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=[
        "What is different between these two images?",
        uploaded_file,  # Use the uploaded file reference
        types.Part.from_bytes(
            data=img2_bytes,
            mime_type='image/png'
        )
    ]
)

print(response.text)
JavaScript:
import {
  GoogleGenAI,
  createUserContent,
  createPartFromUri,
} from "@google/genai";
import * as fs from "node:fs";

const ai = new GoogleGenAI({ apiKey: "GOOGLE_API_KEY" });

async function main() {
  // Upload the first image
  const image1_path = "path/to/image1.jpg";
  const uploadedFile = await ai.files.upload({
    file: image1_path,
    config: { mimeType: "image/jpeg" },
  });

  // Prepare the second image as inline data
  const image2_path = "path/to/image2.png";
  const base64Image2File = fs.readFileSync(image2_path, {
    encoding: "base64",
  });

  // Create the prompt with text and multiple images
  const response = await ai.models.generateContent({
    model: "gemini-2.0-flash",
    contents: createUserContent([
      "What is different between these two images?",
      createPartFromUri(uploadedFile.uri, uploadedFile.mimeType),
      {
        inlineData: {
          mimeType: "image/png",
          data: base64Image2File,
        },
      },
    ]),
  });
  console.log(response.text);
}

await main();
Go:
// Upload the first image
image1Path := "path/to/image1.jpg"
uploadedFile, err := client.UploadFileFromPath(ctx, image1Path, nil)
if err != nil {
    log.Fatal(err)
}
defer client.DeleteFile(ctx, uploadedFile.Name)

// Prepare the second image as inline data
image2Path := "path/to/image2.png"
img2Bytes, err := os.ReadFile(image2Path)
if err != nil {
    log.Fatal(err)
}

// Create the prompt with text and multiple images
model := client.GenerativeModel("gemini-2.0-flash")
prompt := []genai.Part{
    genai.Text("What is different between these two images?"),
    genai.FileData{URI: uploadedFile.URI},
    genai.Blob{MIMEType: "image/png", Data: img2Bytes},
}

resp, err := model.GenerateContent(ctx, prompt...)
if err != nil {
    log.Fatal(err)
}

printResponse(resp)
REST:
# Upload the first image
IMAGE1_PATH="path/to/image1.jpg"
MIME1_TYPE=$(file -b --mime-type "${IMAGE1_PATH}")
NUM1_BYTES=$(wc -c < "${IMAGE1_PATH}")
DISPLAY_NAME1=IMAGE1

tmp_header_file1=upload-header1.tmp

curl "https://generativelanguage.googleapis.com/upload/v1beta/files?key=${GOOGLE_API_KEY}" \
  -D upload-header1.tmp \
  -H "X-Goog-Upload-Protocol: resumable" \
  -H "X-Goog-Upload-Command: start" \
  -H "X-Goog-Upload-Header-Content-Length: ${NUM1_BYTES}" \
  -H "X-Goog-Upload-Header-Content-Type: ${MIME1_TYPE}" \
  -H "Content-Type: application/json" \
  -d "{'file': {'display_name': '${DISPLAY_NAME1}'}}" 2> /dev/null

upload_url1=$(grep -i "x-goog-upload-url: " "${tmp_header_file1}" | cut -d" " -f2 | tr -d "\r")
rm "${tmp_header_file1}"

curl "${upload_url1}" \
  -H "Content-Length: ${NUM1_BYTES}" \
  -H "X-Goog-Upload-Offset: 0" \
  -H "X-Goog-Upload-Command: upload, finalize" \
  --data-binary "@${IMAGE1_PATH}" 2> /dev/null > file_info1.json

file1_uri=$(jq ".file.uri" file_info1.json)
echo file1_uri=$file1_uri

# Prepare the second image (inline)
IMAGE2_PATH="path/to/image2.png"
MIME2_TYPE=$(file -b --mime-type "${IMAGE2_PATH}")

if [[ "$(base64 --version 2>&1)" = *"FreeBSD"* ]]; then
  B64FLAGS="--input"
else
  B64FLAGS="-w0"
fi
IMAGE2_BASE64=$(base64 $B64FLAGS $IMAGE2_PATH)

# Now generate content using both images
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key=$GOOGLE_API_KEY" \
  -H 'Content-Type: application/json' \
  -X POST \
  -d '{
    "contents": [{
      "parts": [
        {"text": "What is different between these two images?"},
        {"file_data": {"mime_type": "'"${MIME1_TYPE}"'", "file_uri": '$file1_uri'}},
        {
          "inline_data": {
            "mime_type": "'"${MIME2_TYPE}"'",
            "data": "'"$IMAGE2_BASE64"'"
          }
        }
      ]
    }]
  }' 2> /dev/null > response.json

cat response.json
echo

jq ".candidates[].content.parts[].text" response.json
Gemini models are trained to detect objects in an image and return their bounding box coordinates. The returned coordinates are normalized to the range [0, 1000] relative to the image dimensions, so you need to scale them back to your original image size.
Python:
prompt = "Detect all of the prominent items in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000."

JavaScript:
const prompt = "Detect all of the prominent items in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000.";

Go:
prompt := []genai.Part{
    genai.FileData{URI: sampleImage.URI},
    genai.Text("Detect all of the prominent items in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000."),
}

REST:
PROMPT="Detect all of the prominent items in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000."
You can use bounding boxes to detect and locate objects in images and video frames. By accurately identifying and delineating objects with bounding boxes, you can unlock a wide range of applications and add intelligence to your projects.
Key benefits:
1. Simple: Integrate object detection into your application easily, whether or not you have computer vision expertise.
2. Customizable: Generate bounding boxes from natural-language instructions (for example, "show the bounding boxes of all the green objects in this image") without training a custom model.
Technical details:
1. Input: a prompt and the associated image or video frame.
2. Output: bounding boxes in the format [y_min, x_min, y_max, x_max]. The origin is the top-left corner; the x-axis is horizontal and the y-axis is vertical. Coordinate values are normalized to the range 0-1000 for each image.
3. Visualization: AI Studio users will see the bounding boxes drawn in the UI.
For Python developers, try the 2D spatial understanding notebook or the experimental 3D pointing notebook.
The model returns the bounding box coordinates in the format [y_min, x_min, y_max, x_max]. To convert these normalized coordinates to pixel coordinates of the original image, follow these steps:
1. Divide each output coordinate by 1000.
2. Multiply the x coordinate by the original image width.
3. Multiply the y coordinate by the height of the original image.
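A minimal Python sketch of this conversion (the function name and the example dimensions are illustrative only):

Python:
# Sketch: convert a normalized [ymin, xmin, ymax, xmax] box (0-1000 scale)
# returned by the model into pixel coordinates of the original image.
def to_pixel_box(box_2d, image_width, image_height):
    ymin, xmin, ymax, xmax = box_2d
    return (
        int(ymin / 1000 * image_height),  # ymin in pixels
        int(xmin / 1000 * image_width),   # xmin in pixels
        int(ymax / 1000 * image_height),  # ymax in pixels
        int(xmax / 1000 * image_width),   # xmax in pixels
    )

# Example: a box covering roughly the right half of a 1024x768 image.
print(to_pixel_box([0, 500, 1000, 1000], 1024, 768))  # (0, 512, 768, 1024)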
For more detailed examples of generating bounding box coordinates and visualizing these coordinates on images, see the Object Detection recipe example.
Starting with the Gemini 2.5 models, Gemini is trained not only to detect items but also to segment them and provide contour masks.
The model predicts a JSON list in which each item represents a segmentation mask. Each item contains a bounding box ("box_2d") in the format [y0, x0, y1, x1] with coordinates normalized to 0-1000, a label identifying the object ("label"), and the segmentation mask inside the bounding box ("mask"), encoded as a base64 PNG that is a probability map with values between 0 and 255. The mask needs to be resized to match the bounding box dimensions and then binarized at your confidence threshold (127 is the midpoint). A decoding sketch follows the prompt examples below.
from google.genai import types prompt = """ Give the segmentation masks for the wooden and glass items. Output a JSON list of segmentation masks where each entry contains the 2D bounding box in the key "box_2d", the segmentation mask in key "mask", and the text label in the key "label". Use describe labels. """
from google.genai import types const prompt = ` Give the segmentation masks for the wooden and glass items. Output a JSON list of segmentation masks where each entry contains the 2D bounding box in the key "box_2d", the segmentation mask in key "mask", and the text label in the key "label". Use describe labels. `;
from google.genai import types prompt := []genai.Part{ genai.FileData{URI: sampleImage.URI}, genai.Text(` Give the segmentation masks for the wooden and glass items. Output a JSON list of segmentation masks where each entry contains the 2D bounding box in the key "box_2d", the segmentation mask in key "mask", and the text label in the key "label". Use describe labels. `), }
from google.genai import types PROMPT=''' Give the segmentation masks for the wooden and glass items. Output a JSON list of segmentation masks where each entry contains the 2D bounding box in the key "box_2d", the segmentation mask in key "mask", and the text label in the key "label". Use describe labels. '''
Example output: segmentation masks for the wooden and glass objects in the image.
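To make the decoding steps above concrete, here is a minimal Python sketch that turns one entry of such a JSON list into a full-size boolean mask. It assumes the response uses the "box_2d"/"mask"/"label" keys requested in the prompt, that the mask string may carry a data-URI prefix, and that Pillow and NumPy are installed; the exact raw output can vary, so treat this as a starting point rather than the official decoder.

Python:
# Sketch: decode one segmentation entry (assumed keys: box_2d, mask, label).
import base64
import io

import numpy as np
from PIL import Image

def decode_mask(entry, image_width, image_height, threshold=127):
    ymin, xmin, ymax, xmax = entry["box_2d"]
    # Convert the 0-1000 normalized box to pixel coordinates.
    x0, x1 = int(xmin / 1000 * image_width), int(xmax / 1000 * image_width)
    y0, y1 = int(ymin / 1000 * image_height), int(ymax / 1000 * image_height)

    # Decode the base64 PNG probability map (strip a possible data-URI prefix).
    png_b64 = entry["mask"].split(",")[-1]
    mask_img = Image.open(io.BytesIO(base64.b64decode(png_b64))).convert("L")

    # Resize the mask to the bounding box and binarize at the threshold.
    mask_img = mask_img.resize((x1 - x0, y1 - y0))
    binary = np.array(mask_img) > threshold

    # Place the mask onto a full-size canvas at the box location.
    full = np.zeros((image_height, image_width), dtype=bool)
    full[y0:y1, x0:x1] = binary
    return entry["label"], full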
Gemini supports the following image format MIME types:
1. PNG - image/png
2. JPEG - image/jpeg
3. WEBP - image/webp
4. HEIC - image/heic
5. HEIF - image/heif
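If you build requests yourself (for example, with inline data or the REST examples above), you need a matching MIME type. The sketch below uses Python's standard mimetypes module to guess and validate one against the formats listed above; note that mimetypes may not recognize HEIC/HEIF on every platform, so this is a convenience check, not a guarantee.

Python:
# Sketch: guess the MIME type from the file extension and check it against
# the image formats Gemini supports.
import mimetypes

SUPPORTED = {"image/png", "image/jpeg", "image/webp", "image/heic", "image/heif"}

def image_mime_type(path):
    mime, _ = mimetypes.guess_type(path)
    if mime not in SUPPORTED:
        raise ValueError(f"Unsupported or unknown image type for {path}: {mime}")
    return mime

print(image_mime_type("path/to/sample.jpg"))  # image/jpeg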
1. File count limit: Gemini 2.5 Pro, 2.0 Flash, 1.5 Pro, and 1.5 Flash support up to 3,600 image files per request.
2. Token calculation:
Gemini 1.5 Flash and Gemini 1.5 Pro: 258 tokens if both dimensions are less than or equal to 384 pixels. Larger images are divided into tiles (minimum tile size 256 px, maximum 768 px, resized to 768x768), and each tile costs 258 tokens.
Gemini 2.0 Flash: 258 tokens if both dimensions are less than or equal to 384 pixels. Larger images are cropped into 768x768 pixel tiles, each costing 258 tokens (a rough estimation sketch appears after the best-practices list below).
3. Best Practices:
Make sure images are correctly rotated.
Use clear, non-blurry images.
When using a single image with text, place the text prompt after the image part in the contents array.
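For rough capacity planning, the Gemini 2.0 Flash tiling rule described in the token calculation above can be turned into a simple estimate. The sketch below uses plain ceiling division into 768x768 tiles, which is an approximation of the documented behavior, not the official token accounting.

Python:
# Sketch: approximate image token count for Gemini 2.0 Flash based on the
# rule above (258 tokens for small images, 258 tokens per 768x768 tile).
import math

def estimate_image_tokens(width, height):
    if width <= 384 and height <= 384:
        return 258
    tiles = math.ceil(width / 768) * math.ceil(height / 768)
    return tiles * 258

print(estimate_image_tokens(384, 384))    # 258
print(estimate_image_tokens(1920, 1080))  # 6 tiles -> 1548 (approximate)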