feat: add look_at tool and multimodal-looker agent
Add a new tool and agent for analyzing media files (PDFs, images, diagrams) that require visual interpretation beyond raw text.

- Add `multimodal-looker` agent using the Gemini 2.5 Flash model
- Add `look_at` tool that spawns multimodal-looker sessions
- Restrict multimodal-looker from calling the task/call_omo_agent/look_at tools

Inspired by the design of Sourcegraph Ampcode's look_at tool.

🤖 GENERATED WITH ASSISTANCE OF [OhMyOpenCode](https://github.com/code-yeongyu/oh-my-opencode)
This commit is contained in:
@@ -4,6 +4,7 @@ import { librarianAgent } from "./librarian"
|
|||||||
import { exploreAgent } from "./explore"
|
import { exploreAgent } from "./explore"
|
||||||
import { frontendUiUxEngineerAgent } from "./frontend-ui-ux-engineer"
|
import { frontendUiUxEngineerAgent } from "./frontend-ui-ux-engineer"
|
||||||
import { documentWriterAgent } from "./document-writer"
|
import { documentWriterAgent } from "./document-writer"
|
||||||
|
import { multimodalLookerAgent } from "./multimodal-looker"
|
||||||
|
|
||||||
export const builtinAgents: Record<string, AgentConfig> = {
|
export const builtinAgents: Record<string, AgentConfig> = {
|
||||||
oracle: oracleAgent,
|
oracle: oracleAgent,
|
||||||
@@ -11,6 +12,7 @@ export const builtinAgents: Record<string, AgentConfig> = {
|
|||||||
explore: exploreAgent,
|
explore: exploreAgent,
|
||||||
"frontend-ui-ux-engineer": frontendUiUxEngineerAgent,
|
"frontend-ui-ux-engineer": frontendUiUxEngineerAgent,
|
||||||
"document-writer": documentWriterAgent,
|
"document-writer": documentWriterAgent,
|
||||||
|
"multimodal-looker": multimodalLookerAgent,
|
||||||
}
|
}
|
||||||
|
|
||||||
export * from "./types"
|
export * from "./types"
|
||||||
|
|||||||
42
src/agents/multimodal-looker.ts
Normal file
42
src/agents/multimodal-looker.ts
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
import type { AgentConfig } from "@opencode-ai/sdk"
|
||||||
|
|
||||||
|
/**
 * Built-in `multimodal-looker` agent definition.
 *
 * Configured as a subagent on `google/gemini-2.5-flash` at temperature 0.1 with
 * the `Read` tool enabled. Its prompt instructs it to examine a media file
 * (PDF, image, diagram) and return only the information that was requested.
 *
 * NOTE(review): the tool key is spelled `Read` (capitalised) — confirm this
 * matches the casing of the host's tool identifier.
 */
export const multimodalLookerAgent: AgentConfig = {
|
||||||
|
description:
|
||||||
|
"Analyze media files (PDFs, images, diagrams) that require interpretation beyond raw text. Extracts specific information or summaries from documents, describes visual content. Use when you need analyzed/extracted data rather than literal file contents.",
|
||||||
|
mode: "subagent",
|
||||||
|
model: "google/gemini-2.5-flash",
|
||||||
|
temperature: 0.1,
|
||||||
|
tools: { Read: true },
|
||||||
|
prompt: `You interpret media files that cannot be read as plain text.
|
||||||
|
|
||||||
|
Your job: examine the attached file and extract ONLY what was requested.
|
||||||
|
|
||||||
|
When to use you:
|
||||||
|
- Media files the Read tool cannot interpret
|
||||||
|
- Extracting specific information or summaries from documents
|
||||||
|
- Describing visual content in images or diagrams
|
||||||
|
- When analyzed/extracted data is needed, not raw file contents
|
||||||
|
|
||||||
|
When NOT to use you:
|
||||||
|
- Source code or plain text files needing exact contents (use Read)
|
||||||
|
- Files that need editing afterward (need literal content from Read)
|
||||||
|
- Simple file reading where no interpretation is needed
|
||||||
|
|
||||||
|
How you work:
|
||||||
|
1. Receive a file path and a goal describing what to extract
|
||||||
|
2. Read and analyze the file deeply
|
||||||
|
3. Return ONLY the relevant extracted information
|
||||||
|
4. The main agent never processes the raw file - you save context tokens
|
||||||
|
|
||||||
|
For PDFs: extract text, structure, tables, data from specific sections
|
||||||
|
For images: describe layouts, UI elements, text, diagrams, charts
|
||||||
|
For diagrams: explain relationships, flows, architecture depicted
|
||||||
|
|
||||||
|
Response rules:
|
||||||
|
- Return extracted information directly, no preamble
|
||||||
|
- If info not found, state clearly what's missing
|
||||||
|
- Match the language of the request
|
||||||
|
- Be thorough on the goal, concise on everything else
|
||||||
|
|
||||||
|
Your output goes straight to the main agent for continued work.`,
|
||||||
|
}
|
||||||
@@ -6,6 +6,7 @@ export type AgentName =
|
|||||||
| "explore"
|
| "explore"
|
||||||
| "frontend-ui-ux-engineer"
|
| "frontend-ui-ux-engineer"
|
||||||
| "document-writer"
|
| "document-writer"
|
||||||
|
| "multimodal-looker"
|
||||||
|
|
||||||
export type AgentOverrideConfig = Partial<AgentConfig>
|
export type AgentOverrideConfig = Partial<AgentConfig>
|
||||||
|
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ import { librarianAgent } from "./librarian"
|
|||||||
import { exploreAgent } from "./explore"
|
import { exploreAgent } from "./explore"
|
||||||
import { frontendUiUxEngineerAgent } from "./frontend-ui-ux-engineer"
|
import { frontendUiUxEngineerAgent } from "./frontend-ui-ux-engineer"
|
||||||
import { documentWriterAgent } from "./document-writer"
|
import { documentWriterAgent } from "./document-writer"
|
||||||
|
import { multimodalLookerAgent } from "./multimodal-looker"
|
||||||
import { deepMerge } from "../shared"
|
import { deepMerge } from "../shared"
|
||||||
|
|
||||||
const allBuiltinAgents: Record<AgentName, AgentConfig> = {
|
const allBuiltinAgents: Record<AgentName, AgentConfig> = {
|
||||||
@@ -13,6 +14,7 @@ const allBuiltinAgents: Record<AgentName, AgentConfig> = {
|
|||||||
explore: exploreAgent,
|
explore: exploreAgent,
|
||||||
"frontend-ui-ux-engineer": frontendUiUxEngineerAgent,
|
"frontend-ui-ux-engineer": frontendUiUxEngineerAgent,
|
||||||
"document-writer": documentWriterAgent,
|
"document-writer": documentWriterAgent,
|
||||||
|
"multimodal-looker": multimodalLookerAgent,
|
||||||
}
|
}
|
||||||
|
|
||||||
function mergeAgentConfig(
|
function mergeAgentConfig(
|
||||||
|
|||||||
12
src/index.ts
12
src/index.ts
@@ -41,7 +41,7 @@ import {
|
|||||||
getCurrentSessionTitle,
|
getCurrentSessionTitle,
|
||||||
} from "./features/claude-code-session-state";
|
} from "./features/claude-code-session-state";
|
||||||
import { updateTerminalTitle } from "./features/terminal";
|
import { updateTerminalTitle } from "./features/terminal";
|
||||||
import { builtinTools, createCallOmoAgent, createBackgroundTools } from "./tools";
|
import { builtinTools, createCallOmoAgent, createBackgroundTools, createLookAt } from "./tools";
|
||||||
import { BackgroundManager } from "./features/background-agent";
|
import { BackgroundManager } from "./features/background-agent";
|
||||||
import { createBuiltinMcps } from "./mcp";
|
import { createBuiltinMcps } from "./mcp";
|
||||||
import { OhMyOpenCodeConfigSchema, type OhMyOpenCodeConfig, type HookName } from "./config";
|
import { OhMyOpenCodeConfigSchema, type OhMyOpenCodeConfig, type HookName } from "./config";
|
||||||
@@ -218,6 +218,7 @@ const OhMyOpenCodePlugin: Plugin = async (ctx) => {
|
|||||||
const backgroundTools = createBackgroundTools(backgroundManager, ctx.client);
|
const backgroundTools = createBackgroundTools(backgroundManager, ctx.client);
|
||||||
|
|
||||||
const callOmoAgent = createCallOmoAgent(ctx, backgroundManager);
|
const callOmoAgent = createCallOmoAgent(ctx, backgroundManager);
|
||||||
|
const lookAt = createLookAt(ctx);
|
||||||
|
|
||||||
const googleAuthHooks = pluginConfig.google_auth
|
const googleAuthHooks = pluginConfig.google_auth
|
||||||
? await createGoogleAntigravityAuthPlugin(ctx)
|
? await createGoogleAntigravityAuthPlugin(ctx)
|
||||||
@@ -230,6 +231,7 @@ const OhMyOpenCodePlugin: Plugin = async (ctx) => {
|
|||||||
...builtinTools,
|
...builtinTools,
|
||||||
...backgroundTools,
|
...backgroundTools,
|
||||||
call_omo_agent: callOmoAgent,
|
call_omo_agent: callOmoAgent,
|
||||||
|
look_at: lookAt,
|
||||||
},
|
},
|
||||||
|
|
||||||
"chat.message": async (input, output) => {
|
"chat.message": async (input, output) => {
|
||||||
@@ -268,6 +270,14 @@ const OhMyOpenCodePlugin: Plugin = async (ctx) => {
|
|||||||
call_omo_agent: false,
|
call_omo_agent: false,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
if (config.agent["multimodal-looker"]) {
|
||||||
|
config.agent["multimodal-looker"].tools = {
|
||||||
|
...config.agent["multimodal-looker"].tools,
|
||||||
|
task: false,
|
||||||
|
call_omo_agent: false,
|
||||||
|
look_at: false,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
const mcpResult = (pluginConfig.claude_code?.mcp ?? true)
|
const mcpResult = (pluginConfig.claude_code?.mcp ?? true)
|
||||||
? await loadMcpConfigs()
|
? await loadMcpConfigs()
|
||||||
|
|||||||
@@ -34,6 +34,7 @@ import type { BackgroundManager } from "../features/background-agent"
|
|||||||
type OpencodeClient = PluginInput["client"]
|
type OpencodeClient = PluginInput["client"]
|
||||||
|
|
||||||
export { createCallOmoAgent } from "./call-omo-agent"
|
export { createCallOmoAgent } from "./call-omo-agent"
|
||||||
|
export { createLookAt } from "./look-at"
|
||||||
|
|
||||||
export function createBackgroundTools(manager: BackgroundManager, client: OpencodeClient) {
|
export function createBackgroundTools(manager: BackgroundManager, client: OpencodeClient) {
|
||||||
return {
|
return {
|
||||||
|
|||||||
23
src/tools/look-at/constants.ts
Normal file
23
src/tools/look-at/constants.ts
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
/** Agent name that `look_at` passes as `agent` when prompting its child session. */
export const MULTIMODAL_LOOKER_AGENT = "multimodal-looker" as const
|
||||||
|
|
||||||
|
/**
 * Description text registered for the `look_at` tool.
 *
 * Explains which file kinds the tool is for, its two parameters
 * (`file_path`, `goal`), and gives example goals.
 */
export const LOOK_AT_DESCRIPTION = `Analyze media files (PDFs, images, diagrams) that require visual interpretation.
|
||||||
|
|
||||||
|
Use this tool to extract specific information from files that cannot be processed as plain text:
|
||||||
|
- PDF documents: extract text, tables, structure, specific sections
|
||||||
|
- Images: describe layouts, UI elements, text content, diagrams
|
||||||
|
- Charts/Graphs: explain data, trends, relationships
|
||||||
|
- Screenshots: identify UI components, text, visual elements
|
||||||
|
- Architecture diagrams: explain flows, connections, components
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
- file_path: Absolute path to the file to analyze
|
||||||
|
- goal: What specific information to extract (be specific for better results)
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
- "Extract all API endpoints from this OpenAPI spec PDF"
|
||||||
|
- "Describe the UI layout and components in this screenshot"
|
||||||
|
- "Explain the data flow in this architecture diagram"
|
||||||
|
- "List all table data from page 3 of this PDF"
|
||||||
|
|
||||||
|
This tool uses a separate context window with Gemini 2.5 Flash for multimodal analysis,
|
||||||
|
saving tokens in the main conversation while providing accurate visual interpretation.`
|
||||||
3
src/tools/look-at/index.ts
Normal file
3
src/tools/look-at/index.ts
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
export * from "./types"
|
||||||
|
export * from "./constants"
|
||||||
|
export { createLookAt } from "./tools"
|
||||||
91
src/tools/look-at/tools.ts
Normal file
91
src/tools/look-at/tools.ts
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
import { tool, type PluginInput } from "@opencode-ai/plugin"
|
||||||
|
import { LOOK_AT_DESCRIPTION, MULTIMODAL_LOOKER_AGENT } from "./constants"
|
||||||
|
import type { LookAtArgs } from "./types"
|
||||||
|
import { log } from "../../shared/logger"
|
||||||
|
|
||||||
|
/**
 * Renders an SDK error payload as readable text.
 *
 * Interpolating an error object straight into a template string produces
 * "[object Object]", which hides the actual failure from the calling agent.
 */
function formatLookAtError(error: unknown): string {
  if (error instanceof Error) return error.message
  if (typeof error === "string") return error
  try {
    // JSON.stringify returns undefined for values such as `undefined` or functions.
    return JSON.stringify(error) ?? String(error)
  } catch {
    // Circular structures (or throwing toJSON) fall back to the default string form.
    return String(error)
  }
}

/**
 * Builds the `look_at` tool.
 *
 * On each call the tool creates a child session of the calling session, prompts
 * the multimodal-looker agent with the file path and goal, and returns the text
 * of the most recent assistant message from that child session.
 *
 * Failures are reported as strings starting with "Error:" rather than thrown,
 * so the calling agent receives them as ordinary tool output.
 *
 * @param ctx Plugin input; its `client` is used for session create/prompt/messages calls.
 * @returns The tool definition to register under the `look_at` name.
 */
export function createLookAt(ctx: PluginInput) {
  return tool({
    description: LOOK_AT_DESCRIPTION,
    args: {
      file_path: tool.schema.string().describe("Absolute path to the file to analyze"),
      goal: tool.schema.string().describe("What specific information to extract from the file"),
    },
    async execute(args: LookAtArgs, toolContext) {
      log(`[look_at] Analyzing file: ${args.file_path}, goal: ${args.goal}`)

      const prompt = `Analyze this file and extract the requested information.

File path: ${args.file_path}
Goal: ${args.goal}

Read the file using the Read tool, then provide ONLY the extracted information that matches the goal.
Be thorough on what was requested, concise on everything else.
If the requested information is not found, clearly state what is missing.`

      log(`[look_at] Creating session with parent: ${toolContext.sessionID}`)
      const createResult = await ctx.client.session.create({
        body: {
          parentID: toolContext.sessionID,
          // Keep the session title short; the full goal is in the prompt itself.
          title: `look_at: ${args.goal.substring(0, 50)}`,
        },
      })

      if (createResult.error) {
        log(`[look_at] Session create error:`, createResult.error)
        return `Error: Failed to create session: ${formatLookAtError(createResult.error)}`
      }

      const sessionID = createResult.data.id
      log(`[look_at] Created session: ${sessionID}`)

      log(`[look_at] Sending prompt to session ${sessionID}`)
      const promptResult = await ctx.client.session.prompt({
        path: { id: sessionID },
        body: {
          agent: MULTIMODAL_LOOKER_AGENT,
          // Prevent the looker from delegating further or re-entering this tool.
          tools: {
            task: false,
            call_omo_agent: false,
            look_at: false,
          },
          parts: [{ type: "text", text: prompt }],
        },
      })

      // Surface a failed prompt directly instead of reporting a misleading "no response" later.
      if (promptResult.error) {
        log(`[look_at] Prompt error:`, promptResult.error)
        return `Error: Failed to prompt multimodal-looker agent: ${formatLookAtError(promptResult.error)}`
      }

      log(`[look_at] Prompt sent, fetching messages...`)

      const messagesResult = await ctx.client.session.messages({
        path: { id: sessionID },
      })

      if (messagesResult.error) {
        log(`[look_at] Messages error:`, messagesResult.error)
        return `Error: Failed to get messages: ${formatLookAtError(messagesResult.error)}`
      }

      const messages = messagesResult.data
      log(`[look_at] Got ${messages.length} messages`)

      // Single pass for the newest assistant message; avoids sorting (and mutating) the list.
      // Ties keep the earliest entry in list order, matching a stable descending sort.
      // eslint-disable-next-line @typescript-eslint/no-explicit-any -- SDK message shape is not narrowed here
      let lastAssistantMessage: any
      let latestCreated = Number.NEGATIVE_INFINITY
      // eslint-disable-next-line @typescript-eslint/no-explicit-any -- see above
      for (const message of messages as any[]) {
        if (message.info.role !== "assistant") continue
        const created: number = message.info.time?.created ?? 0
        if (created > latestCreated) {
          latestCreated = created
          lastAssistantMessage = message
        }
      }

      if (!lastAssistantMessage) {
        log(`[look_at] No assistant message found`)
        return `Error: No response from multimodal-looker agent`
      }

      log(`[look_at] Found assistant message with ${lastAssistantMessage.parts.length} parts`)

      const responseText: string = lastAssistantMessage.parts
        // eslint-disable-next-line @typescript-eslint/no-explicit-any -- part union is not narrowed here
        .filter((part: any) => part.type === "text")
        // eslint-disable-next-line @typescript-eslint/no-explicit-any -- part union is not narrowed here
        .map((part: any) => part.text)
        .join("\n")

      // An empty string would look like a successful but blank analysis to the caller.
      if (responseText.trim() === "") {
        log(`[look_at] Assistant message contained no text`)
        return `Error: multimodal-looker agent returned no text output`
      }

      log(`[look_at] Got response, length: ${responseText.length}`)

      return responseText
    },
  })
}
|
||||||
4
src/tools/look-at/types.ts
Normal file
4
src/tools/look-at/types.ts
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
/** Arguments accepted by the `look_at` tool. */
export interface LookAtArgs {
|
||||||
|
/** Absolute path to the file to analyze. */
file_path: string
|
||||||
|
/** What specific information to extract from the file. */
goal: string
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user