Lesson 17: Multimodal Agents (Vision + Tools)
This advanced lesson demonstrates how to build a powerful agent that can see and act by combining multimodal vision capabilities with tool use.
Code: lesson_17_multimodal_agents.mjs
The key is to construct a message payload containing both the image (via `createMediaMessage`) and the user's text prompt. The agent's high-level `.run()` method seamlessly processes this multimodal input, allowing the model to analyze the image and then decide to call the appropriate tool based on its visual understanding.
// lesson_17_multimodal_agents.mjs
// Merci SDK Tutorial: Lesson 17 - Multimodal Agents (Vision + Tools)
// --- IMPORTS ---
import { MerciClient, createMediaMessage, createUserMessage } from '../lib/merci.2.14.0.mjs';
import { token } from '../secret/token.mjs';
// Model identifier handed to client.chat.session() in main(). This lesson sends
// an image together with a tool definition, so the chosen model has to be a
// powerful multimodal one that supports both vision input and tool use.
const MODEL = 'openai-gpt-5-mini';
// --- TOOL DEFINITION ---
/**
 * Tool descriptor for the inventory demo.
 *
 * - `name`: identifier the model uses when it requests the tool.
 * - `parameters.schema`: JSON-schema-style description of the arguments; all
 *   three fields are listed as required.
 * - `execute`: handler invoked with the parsed arguments. It only logs the
 *   simulated database write and resolves to `{ success, productId }`.
 */
const inventoryTool = {
  name: 'add_product_to_inventory',
  parameters: {
    schema: {
      type: 'object',
      properties: {
        product_name: {
          type: 'string',
          description: 'The name of the product, e.g., "Tomato Soup".',
        },
        brand: {
          type: 'string',
          description: 'The brand of the product, e.g., "Campbell\'s".',
        },
        quantity: {
          type: 'number',
          description: 'The number of items to add.',
        },
      },
      required: ['product_name', 'brand', 'quantity'],
    },
  },
  async execute({ product_name: productName, brand, quantity }) {
    // Stand-in for a real database write: announce what would be stored.
    const logLine = `\n[TOOL EXECUTING] DATABASE WRITE: Adding ${quantity} x ${brand} ${productName}.`;
    console.log(logLine);

    // The id is derived from the current timestamp in milliseconds.
    const productId = `prod_${Date.now()}`;
    return { success: true, productId };
  },
};
/**
 * Runs the Lesson 17 demo: builds a tool-enabled chat session, sends it an
 * image plus a text prompt, and prints the assistant's final answer.
 *
 * Any failure is reported on stderr and terminates the process with exit
 * status 1; a missing image file (`ENOENT`) gets its own, more specific
 * message.
 *
 * @returns {Promise<void>} Settles once the final answer has been printed.
 */
async function main() {
  console.log(`--- Merci SDK Tutorial: Lesson 17 - Multimodal Agents (Model: ${MODEL}) ---`);
  console.log('NOTE: This lesson requires an `image.png` file in the same directory.');

  try {
    // --- STEP 1: INITIALIZE THE CLIENT ---
    console.log('\n[STEP 1] Initializing MerciClient...');
    const client = new MerciClient({ token });

    client.on('tool_start', ({ calls }) => {
      // Report every call carried by the event, not only the first one, so
      // that additional tool calls are visible and an empty list cannot
      // throw inside the handler.
      for (const { name, arguments: args } of calls) {
        console.log(`\n[EVENT: tool_start] Model is calling '${name}' with args: ${args}`);
      }
    });

    // --- STEP 2: PREPARE PROMPT, IMAGE, AND TOOL ---
    console.log('[STEP 2] Preparing image, prompt, and tool definition...');
    const imagePath = './image.png';
    const userPrompt = "Analyze the attached image and use the provided tool to add the product to our inventory.";
    const tools = [inventoryTool];

    // --- STEP 3: CONFIGURE THE MULTIMODAL AGENT ---
    console.log('[STEP 3] Configuring the agent with the inventory tool...');
    const agent = client.chat.session(MODEL).withTools(tools);

    // --- STEP 4: PREPARE THE MULTIMODAL MESSAGE PAYLOAD ---
    console.log('[STEP 4] Creating the message payload with both image and text...');
    // The media message comes first, followed by the text instruction.
    const messages = [
      await createMediaMessage(imagePath),
      createUserMessage(userPrompt),
    ];

    // --- STEP 5: RUN THE AGENT ---
    console.log('[STEP 5] Running the agent. The model will first see the image, then decide to call the tool...');
    const finalAnswer = await agent.run(messages);

    // --- FINAL RESULT ---
    console.log('\n\n--- FINAL RESULT ---');
    console.log(`🖼️ Media > ${imagePath}`);
    console.log(`👤 User > ${userPrompt}`);
    console.log(`🤖 Assistant > ${finalAnswer}`);
    console.log('--------------------');
    console.log('The model successfully interpreted the image content and used it to populate the tool\'s arguments.');
  } catch (error) {
    // A missing image is the most likely setup mistake, so it gets a
    // dedicated message instead of the generic diagnostics below.
    if (error.code === 'ENOENT') {
      console.error(`\n[FATAL ERROR] Image file not found at "${error.path}"`);
      console.error(' Please make sure an `image.png` file exists before running the script.');
      process.exit(1);
    }

    // Generic failure: print whatever diagnostic fields the error carries.
    console.error('\n\n[FATAL ERROR] An error occurred during the operation.');
    console.error(' Message:', error.message);
    if (error.status) { console.error(' API Status:', error.status); }
    if (error.details) { console.error(' Details:', JSON.stringify(error.details, null, 2)); }
    if (error.stack) { console.error(' Stack:', error.stack); }
    console.error('\n Possible causes: Invalid token, network issues, or an API service problem.');
    process.exit(1);
  }
}
// Entry point. main() reports its own failures, so this handler is only a
// backstop for a rejection that escapes it (for example, if the error
// reporting itself throws). Log it and make sure the process does not exit
// with a success status.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
Expected Output
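The agent will perform the following steps automatically: receive the image, analyze it to identify the product, determine the correct tool to use, call it with arguments extracted from the image, and finally formulate a confirmation message. The excerpt below shows the key lines of the output; the [STEP n] progress messages are omitted, and the exact product details and assistant wording depend on your image and the model's response.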
[EVENT: tool_start] Model is calling 'add_product_to_inventory' with args: {"product_name":"Tomato Soup","brand":"Campbell's","quantity":1}
[TOOL EXECUTING] DATABASE WRITE: Adding 1 x Campbell's Tomato Soup.
--- FINAL RESULT ---
🖼️ Media > ./image.png
👤 User > Analyze the attached image and use the provided tool to add the product to our inventory.
🤖 Assistant > I have successfully added one can of Campbell's Tomato Soup to the inventory.
--------------------