Context Caching
View Original →Gemini 3.1 Flash-Lite Preview is now available. Try it in AI Studio.
Home
Gemini API
Docs
Send feedback
Context caching
In a typical AI workflow, you might pass the same input tokens over and over to
a model. The Gemini API offers two different caching mechanisms:
- Implicit caching (automatically enabled on most Gemini models, no cost saving guarantee)
- Explicit caching (can be manually enabled on most models, cost saving guarantee)
Explicit caching is useful in cases where you want to guarantee cost savings,
but with some added developer work.
Implicit caching
Implicit caching is enabled by default and available for most Gemini models. We automatically
pass on cost savings if your request hits caches. There is nothing you need to do
in order to enable this. It is effective as of May 8th, 2025. The minimum input
token count for context caching is listed in the following table for each model:
Model
Min token limit
Gemini 3.1 Pro Preview
4096
Gemini 3 Flash Preview
1024
Gemini 2.5 Flash
1024
Gemini 2.5 Pro
4096
To increase the chance of an implicit cache hit:
- Try putting large and common contents at the beginning of your prompt
- Try to send requests with similar prefix in a short amount of time
You can see the number of tokens which were cache hits in the response object's
usage_metadata field.
Explicit caching
Using the Gemini API explicit caching feature, you can pass some content
to the model once, cache the input tokens, and then refer to the cached tokens
for subsequent requests. At certain volumes, using cached tokens is lower cost
than passing in the same corpus of tokens repeatedly.
When you cache a set of tokens, you can choose how long you want the cache to
exist before the tokens are automatically deleted. This caching duration is
called the time to live (TTL). If not set, the TTL defaults to 1 hour. The
cost for caching depends on the input token size and how long you want the
tokens to persist.
This section assumes that you've installed a Gemini SDK (or have curl installed)
and that you've configured an API key, as shown in the
quickstart.Generate content using a cache
Python
The following example shows how to generate content using a cached system
instruction and video file.
Videos
import os
import pathlib
import requests
import time
from google import genai
from google.genai import types
client = genai.Client()
Download a test video file and save it locally
url = 'https://storage.googleapis.com/generativeai-downloads/data/SherlockJr._10min.mp4'
path_to_video_file = pathlib.Path('SherlockJr._10min.mp4')
if not path_to_video_file.exists():
path_to_video_file.write_bytes(requests.get(url).content)
Upload the video using the Files API
video_file = client.files.upload(file=path_to_video_file)
Wait for the file to finish processing
while video_file.state.name == 'PROCESSING':
time.sleep(2.5)
video_file = client.files.get(name=video_file.name)
print(f'Video processing complete: {video_file.uri}')
model='models/gemini-3-flash-preview'
Create a cache with a 5 minute TTL (300 seconds)
cache = client.caches.create(
model=model,
config=types.CreateCachedContentConfig(
display_name='sherlock jr movie', # used to identify the cache
system_instruction=(
'You are an expert video analyzer, and your job is to answer '
'the user\'s query based on the video file you have access to.'
),
contents=[video_file],
ttl="300s",
)
)
response = client.models.generate_content(
model = model,
contents= (
'Introduce different characters in the movie by describing '
'their personality, looks, and names. Also list the timestamps '
'they were introduced for the first time.'),
config=types.GenerateContentConfig(cached_content=cache.name)
)
print(response.usage_metadata)
print(response.text)
PDFs
from google import genai
from google.genai import types
import io
import httpx
client = genai.Client()
long_context_pdf_path = "https://sma.nasa.gov/SignificantIncidents/assets/a11_missionreport.pdf"
Retrieve and upload the PDF using the File API
doc_io = io.BytesIO(httpx.get(long_context_pdf_path).content)
document = client.files.upload(
file=doc_io,
config=dict(mime_type='application/pdf')
)
model_name = "gemini-3-flash-preview"
system_instruction = "You are an expert analyzing transcripts."
Create a cached content object
cache = client.caches.create(
model=model_name,
config=types.CreateCachedContentConfig(
system_instruction=system_instruction,
contents=[document],
)
)
print(f'{cache=}')
response = client.models.generate_content(
model=model_name,
contents="Please summarize this transcript",
config=types.GenerateContentConfig(
cached_content=cache.name
))
print(f'{response.usage_metadata=}')
print('\n\n', response.text)
JavaScript
The following example shows how to generate content using a cached system
instruction and a text file.
import {
GoogleGenAI,
createUserContent,
createPartFromUri,
} from "@google/genai";
const ai = new GoogleGenAI({ apiKey: "GEMINI_API_KEY" });
async function main() {
const doc = await ai.files.upload({
file: "path/to/file.txt",
config: { mimeType: "text/plain" },
});
console.log("Uploaded file name:", doc.name);
const modelName = "gemini-3-flash-preview";
const cache = await ai.caches.create({
model: modelName,
config: {
contents: createUserContent(createPartFromUri(doc.uri, doc.mimeType)),
systemInstruction: "You are an expert analyzing transcripts.",
},
});
console.log("Cache created:", cache);
const response = await ai.models.generateContent({
model: modelName,
contents: "Please summarize this transcript",
config: { cachedContent: cache.name },
});
console.log("Response text:", response.text);
}
await main();
Go
The following example shows how to generate content using a cache.
package main
import (
"context"
"fmt"
"log"
"google.golang.org/genai"
)
func main() {
ctx := context.Background()
client, err := genai.NewClient(ctx, &genai.ClientConfig{
APIKey: "GOOGLE_API_KEY",
Backend: genai.BackendGeminiAPI,
})
if err != nil {
log.Fatal(err)
}
modelName := "gemini-3-flash-preview"
document, err := client.Files.UploadFromPath(
ctx,
"media/a11.txt",
&genai.UploadFileConfig{
MIMEType: "text/plain",
},
)
if err != nil {
log.Fatal(err)
}
parts := []*genai.Part{
genai.NewPartFromURI(document.URI, document.MIMEType),
}
contents := []*genai.Content{
genai.NewContentFromParts(parts, genai.RoleUser),
}
cache, err := client.Caches.Create(ctx, modelName, &genai.CreateCachedContentConfig{
Contents: contents,
SystemInstruction: genai.NewContentFromText(
"You are an expert analyzing transcripts.", genai.RoleUser,
),
})
if err != nil {
log.Fatal(err)
}
fmt.Println("Cache created:")
fmt.Println(cache)
// Use the cache for generating content.
response, err := client.Models.GenerateContent(
ctx,
modelName,
genai.Text("Please summarize this transcript"),
&genai.GenerateContentConfig{
CachedContent: cache.Name,
},
)
if err != nil {
log.Fatal(err)
}
printResponse(response) // helper for printing response parts
}
REST
The following example shows how to create a cache and then use it to
generate content.
Videos
wget https://storage.googleapis.com/generativeai-downloads/data/a11.txt
echo '{
"model": "models/gemini-3-flash-preview",
"contents":[
{
"parts":[
{
"inline_data": {
"mime_type":"text/plain",
"data": "'$(base64 $B64FLAGS a11.txt)'"
}
}
],
"role": "user"
}
],
"systemInstruction": {
"parts": [
{
"text": "You are an expert at analyzing transcripts."
}
]
},
"ttl": "300s"
}' > request.json
curl -X POST "https://generativelanguage.googleapis.com/v1beta/cachedContents?key=$GEMINI_API_KEY" \
-H 'Content-Type: application/json' \
-d @request.json \
cache.json
CACHE_NAME=$(cat cache.json | grep '"name":' | cut -d '"' -f 4 | head -n 1)
curl -X POST "https://generativelanguage.googleapis.com/v1beta/models/gemini-3-flash-preview:generateContent?key=$GEMINI_API_KEY" \
-H 'Content-Type: application/json' \
-d '{
"contents": [
{
"parts":[{
"text": "Please summarize this transcript"
}],
"role": "user"
},
],
"cachedContent": "'$CACHE_NAME'"
}'
PDFs
``
DOC_URL="https://sma.nasa.gov/SignificantIncidents/assets/a11_missionreport.pdf"
DISPLAY_NAME="A11_Mission_Report"
SYSTEM_INSTRUCTION="You are an expert at analyzing transcripts."
PROMPT="Please summarize this transcript"
MODEL="models/gemini-3-flash-preview"
TTL="300s"
Download the PDF
wget -O "${DISPLAY_NAME}.pdf" "${DOC_URL}"
MIME_TYPE=$(file -b --mime-type "${DISPLAY_NAME}.pdf")
NUM_BYTES=$(wc -c < "${DISPLAY_NAME}.pdf")
echo "MIME_TYPE: ${MIME_TYPE}"
echo "NUM_BYTES: ${NUM_BYTES}"
tmp_heade