/** One-paragraph summary of the agent, stored as the `description` of the agent config. */
const MULTIMODAL_LOOKER_DESCRIPTION =
  "Analyze media files (PDFs, images, diagrams) that require interpretation beyond raw text. Extracts specific information or summaries from documents, describes visual content. Use when you need analyzed/extracted data rather than literal file contents."

/** Prompt text handed to the agent; kept verbatim as a single template literal. */
const MULTIMODAL_LOOKER_PROMPT = `You interpret media files that cannot be read as plain text.

Your job: examine the attached file and extract ONLY what was requested.

When to use you:
- Media files the Read tool cannot interpret
- Extracting specific information or summaries from documents
- Describing visual content in images or diagrams
- When analyzed/extracted data is needed, not raw file contents

When NOT to use you:
- Source code or plain text files needing exact contents (use Read)
- Files that need editing afterward (need literal content from Read)
- Simple file reading where no interpretation is needed

How you work:
1. Receive a file path and a goal describing what to extract
2. Read and analyze the file deeply
3. Return ONLY the relevant extracted information
4. The main agent never processes the raw file - you save context tokens

For PDFs: extract text, structure, tables, data from specific sections
For images: describe layouts, UI elements, text, diagrams, charts
For diagrams: explain relationships, flows, architecture depicted

Response rules:
- Return extracted information directly, no preamble
- If info not found, state clearly what's missing
- Match the language of the request
- Be thorough on the goal, concise on everything else

Your output goes straight to the main agent for continued work.`

/**
 * Configuration for the "multimodal-looker" subagent.
 *
 * Runs as a subagent on `google/gemini-2.5-flash` at temperature 0.1 and is
 * instructed to return only the information extracted from a media file.
 */
export const multimodalLookerAgent: AgentConfig = {
  description: MULTIMODAL_LOOKER_DESCRIPTION,
  mode: "subagent",
  model: "google/gemini-2.5-flash",
  temperature: 0.1,
  // NOTE(review): the other tool keys in this change are lowercase (task,
  // call_omo_agent, look_at) — confirm "Read" is the identifier the host expects.
  tools: { Read: true },
  prompt: MULTIMODAL_LOOKER_PROMPT,
}
= { @@ -13,6 +14,7 @@ const allBuiltinAgents: Record = { explore: exploreAgent, "frontend-ui-ux-engineer": frontendUiUxEngineerAgent, "document-writer": documentWriterAgent, + "multimodal-looker": multimodalLookerAgent, } function mergeAgentConfig( diff --git a/src/index.ts b/src/index.ts index 84d2207..502e69e 100644 --- a/src/index.ts +++ b/src/index.ts @@ -41,7 +41,7 @@ import { getCurrentSessionTitle, } from "./features/claude-code-session-state"; import { updateTerminalTitle } from "./features/terminal"; -import { builtinTools, createCallOmoAgent, createBackgroundTools } from "./tools"; +import { builtinTools, createCallOmoAgent, createBackgroundTools, createLookAt } from "./tools"; import { BackgroundManager } from "./features/background-agent"; import { createBuiltinMcps } from "./mcp"; import { OhMyOpenCodeConfigSchema, type OhMyOpenCodeConfig, type HookName } from "./config"; @@ -218,6 +218,7 @@ const OhMyOpenCodePlugin: Plugin = async (ctx) => { const backgroundTools = createBackgroundTools(backgroundManager, ctx.client); const callOmoAgent = createCallOmoAgent(ctx, backgroundManager); + const lookAt = createLookAt(ctx); const googleAuthHooks = pluginConfig.google_auth ? await createGoogleAntigravityAuthPlugin(ctx) @@ -230,6 +231,7 @@ const OhMyOpenCodePlugin: Plugin = async (ctx) => { ...builtinTools, ...backgroundTools, call_omo_agent: callOmoAgent, + look_at: lookAt, }, "chat.message": async (input, output) => { @@ -268,6 +270,14 @@ const OhMyOpenCodePlugin: Plugin = async (ctx) => { call_omo_agent: false, }; } + if (config.agent["multimodal-looker"]) { + config.agent["multimodal-looker"].tools = { + ...config.agent["multimodal-looker"].tools, + task: false, + call_omo_agent: false, + look_at: false, + }; + } const mcpResult = (pluginConfig.claude_code?.mcp ?? true) ? 
/**
 * Name of the agent that the look_at tool sends its prompt to.
 * Must stay identical to the key under which the agent is registered.
 */
export const MULTIMODAL_LOOKER_AGENT = "multimodal-looker" as const

/**
 * Description text for the look_at tool: what it is for, its two parameters
 * (`file_path`, `goal`) and example goals.
 *
 * NOTE(review): the last paragraph names "Gemini 2.5 Flash"; the model itself is
 * set in the agent config, so this sentence has to be kept in sync by hand.
 */
export const LOOK_AT_DESCRIPTION = `Analyze media files (PDFs, images, diagrams) that require visual interpretation.

Use this tool to extract specific information from files that cannot be processed as plain text:
- PDF documents: extract text, tables, structure, specific sections
- Images: describe layouts, UI elements, text content, diagrams
- Charts/Graphs: explain data, trends, relationships
- Screenshots: identify UI components, text, visual elements
- Architecture diagrams: explain flows, connections, components

Parameters:
- file_path: Absolute path to the file to analyze
- goal: What specific information to extract (be specific for better results)

Examples:
- "Extract all API endpoints from this OpenAPI spec PDF"
- "Describe the UI layout and components in this screenshot"
- "Explain the data flow in this architecture diagram"
- "List all table data from page 3 of this PDF"

This tool uses a separate context window with Gemini 2.5 Flash for multimodal analysis,
saving tokens in the main conversation while providing accurate visual interpretation.`
/**
 * Renders an arbitrary error payload as readable text for tool output.
 *
 * Interpolating an object directly into a template literal produces
 * "[object Object]", which hides the real failure from the calling agent.
 */
function describeError(error: unknown): string {
  if (error instanceof Error) return error.message
  if (typeof error === "string") return error
  try {
    // JSON.stringify yields undefined for inputs such as `undefined` or functions.
    return JSON.stringify(error) ?? String(error)
  } catch {
    // JSON.stringify throws on circular structures and BigInt values.
    return String(error)
  }
}

/**
 * Builds the `look_at` tool.
 *
 * On execution the tool:
 * 1. creates a session whose parent is the calling session,
 * 2. prompts the multimodal-looker agent in that session with the file path and
 *    goal, with `task`, `call_omo_agent` and `look_at` switched off,
 * 3. fetches the session messages and returns the text parts of the most
 *    recently created assistant message, joined by newlines.
 *
 * Failures are not thrown by this function itself; they are reported to the
 * caller as a string starting with "Error:".
 *
 * @param ctx Plugin input; only `ctx.client.session` is used here.
 * @returns The tool definition produced by `tool(...)`.
 */
export function createLookAt(ctx: PluginInput) {
  return tool({
    description: LOOK_AT_DESCRIPTION,
    args: {
      file_path: tool.schema.string().describe("Absolute path to the file to analyze"),
      goal: tool.schema.string().describe("What specific information to extract from the file"),
    },
    async execute(args: LookAtArgs, toolContext) {
      log(`[look_at] Analyzing file: ${args.file_path}, goal: ${args.goal}`)

      const prompt = `Analyze this file and extract the requested information.

File path: ${args.file_path}
Goal: ${args.goal}

Read the file using the Read tool, then provide ONLY the extracted information that matches the goal.
Be thorough on what was requested, concise on everything else.
If the requested information is not found, clearly state what is missing.`

      log(`[look_at] Creating session with parent: ${toolContext.sessionID}`)
      const createResult = await ctx.client.session.create({
        body: {
          parentID: toolContext.sessionID,
          title: `look_at: ${args.goal.substring(0, 50)}`,
        },
      })

      if (createResult.error) {
        log(`[look_at] Session create error:`, createResult.error)
        return `Error: Failed to create session: ${describeError(createResult.error)}`
      }

      const sessionID = createResult.data.id
      log(`[look_at] Created session: ${sessionID}`)

      log(`[look_at] Sending prompt to session ${sessionID}`)
      const promptResult = await ctx.client.session.prompt({
        path: { id: sessionID },
        body: {
          agent: MULTIMODAL_LOOKER_AGENT,
          // Keep the looker from delegating further or calling this tool again.
          tools: {
            task: false,
            call_omo_agent: false,
            look_at: false,
          },
          parts: [{ type: "text", text: prompt }],
        },
      })

      // A failed prompt would otherwise surface later as a misleading
      // "No response" error once the message list comes back empty.
      if (promptResult.error) {
        log(`[look_at] Prompt error:`, promptResult.error)
        return `Error: Failed to prompt multimodal-looker agent: ${describeError(promptResult.error)}`
      }

      log(`[look_at] Prompt sent, fetching messages...`)

      const messagesResult = await ctx.client.session.messages({
        path: { id: sessionID },
      })

      if (messagesResult.error) {
        log(`[look_at] Messages error:`, messagesResult.error)
        return `Error: Failed to get messages: ${describeError(messagesResult.error)}`
      }

      const messages = messagesResult.data
      log(`[look_at] Got ${messages.length} messages`)

      // Newest assistant message first; a missing creation time sorts as 0.
      // `filter` returns a fresh array, so the in-place sort leaves `messages` untouched.
      const lastAssistantMessage = messages
        // eslint-disable-next-line @typescript-eslint/no-explicit-any -- message shape comes from the SDK response
        .filter((m: any) => m.info.role === "assistant")
        // eslint-disable-next-line @typescript-eslint/no-explicit-any -- message shape comes from the SDK response
        .sort((a: any, b: any) => (b.info.time?.created ?? 0) - (a.info.time?.created ?? 0))[0]

      if (!lastAssistantMessage) {
        log(`[look_at] No assistant message found`)
        return `Error: No response from multimodal-looker agent`
      }

      log(`[look_at] Found assistant message with ${lastAssistantMessage.parts.length} parts`)

      const responseText = lastAssistantMessage.parts
        // eslint-disable-next-line @typescript-eslint/no-explicit-any -- part shape comes from the SDK response
        .filter((p: any) => p.type === "text")
        // eslint-disable-next-line @typescript-eslint/no-explicit-any -- part shape comes from the SDK response
        .map((p: any) => p.text)
        .join("\n")

      log(`[look_at] Got response, length: ${responseText.length}`)

      // An assistant message without any text would otherwise hand the caller
      // an empty string that looks like a successful, blank answer.
      if (responseText.trim().length === 0) {
        log(`[look_at] Assistant message contained no text`)
        return `Error: multimodal-looker agent returned no text content`
      }

      return responseText
    },
  })
}