feat: add look_at tool and multimodal-looker agent

Add a new tool and agent for analyzing media files (PDFs, images, diagrams) that require visual interpretation beyond raw text.

- Add `multimodal-looker` agent using the Gemini 2.5 Flash model
- Add `look_at` tool that spawns multimodal-looker sessions
- Restrict multimodal-looker from calling the task/call_omo_agent/look_at tools

Inspired by Sourcegraph Ampcode's look_at tool design.

🤖 GENERATED WITH ASSISTANCE OF [OhMyOpenCode](https://github.com/code-yeongyu/oh-my-opencode)
2025-12-13 15:25:29 +09:00
parent 821b0b8e9f
commit a3938e8c25
10 changed files with 180 additions and 1 deletions
--- a/src/agents/index.ts
+++ b/src/agents/index.ts
@@ -4,6 +4,7 @@ import { librarianAgent } from "./librarian"
 import { exploreAgent } from "./explore"
 import { frontendUiUxEngineerAgent } from "./frontend-ui-ux-engineer"
 import { documentWriterAgent } from "./document-writer"
 import { multimodalLookerAgent } from "./multimodal-looker"
 export const builtinAgents: Record<string, AgentConfig> = {
  oracle: oracleAgent,
@@ -11,6 +12,7 @@ export const builtinAgents: Record<string, AgentConfig> = {
  explore: exploreAgent,
  "frontend-ui-ux-engineer": frontendUiUxEngineerAgent,
  "document-writer": documentWriterAgent,
  "multimodal-looker": multimodalLookerAgent,
 }
 export * from "./types"
--- a/src/agents/multimodal-looker.ts
+++ b/src/agents/multimodal-looker.ts
@@ -0,0 +1,42 @@
 import type { AgentConfig } from "@opencode-ai/sdk"
 /** Short capability summary stored in the agent's `description` field. */
 const MULTIMODAL_LOOKER_DESCRIPTION =
  "Analyze media files (PDFs, images, diagrams) that require interpretation beyond raw text. Extracts specific information or summaries from documents, describes visual content. Use when you need analyzed/extracted data rather than literal file contents."
 /**
  * Instructions stored in the agent's `prompt` field: what the agent is for,
  * when it should and should not be used, and how it must shape its answer.
  */
 const MULTIMODAL_LOOKER_PROMPT = `You interpret media files that cannot be read as plain text.
 Your job: examine the attached file and extract ONLY what was requested.
 When to use you:
 - Media files the Read tool cannot interpret
 - Extracting specific information or summaries from documents
 - Describing visual content in images or diagrams
 - When analyzed/extracted data is needed, not raw file contents
 When NOT to use you:
 - Source code or plain text files needing exact contents (use Read)
 - Files that need editing afterward (need literal content from Read)
 - Simple file reading where no interpretation is needed
 How you work:
 1. Receive a file path and a goal describing what to extract
 2. Read and analyze the file deeply
 3. Return ONLY the relevant extracted information
 4. The main agent never processes the raw file - you save context tokens
 For PDFs: extract text, structure, tables, data from specific sections
 For images: describe layouts, UI elements, text, diagrams, charts
 For diagrams: explain relationships, flows, architecture depicted
 Response rules:
 - Return extracted information directly, no preamble
 - If info not found, state clearly what's missing
 - Match the language of the request
 - Be thorough on the goal, concise on everything else
 Your output goes straight to the main agent for continued work.`
 /**
  * Configuration for the `multimodal-looker` subagent.
  *
  * Declared as a subagent on the `google/gemini-2.5-flash` model with a
  * temperature of 0.1, and given the description and prompt defined above.
  */
 export const multimodalLookerAgent: AgentConfig = {
  description: MULTIMODAL_LOOKER_DESCRIPTION,
  mode: "subagent",
  model: "google/gemini-2.5-flash",
  temperature: 0.1,
  // NOTE(review): the other tool keys set for this agent elsewhere in this change
  // (task, call_omo_agent, look_at) are lower-case — confirm `Read` is the exact
  // key the runtime expects.
  tools: { Read: true },
  prompt: MULTIMODAL_LOOKER_PROMPT,
 }
--- a/src/agents/types.ts
+++ b/src/agents/types.ts
@@ -6,6 +6,7 @@ export type AgentName =
  | "explore"
  | "frontend-ui-ux-engineer"
  | "document-writer"
  | "multimodal-looker"
 export type AgentOverrideConfig = Partial<AgentConfig>
--- a/src/agents/utils.ts
+++ b/src/agents/utils.ts
@@ -5,6 +5,7 @@ import { librarianAgent } from "./librarian"
 import { exploreAgent } from "./explore"
 import { frontendUiUxEngineerAgent } from "./frontend-ui-ux-engineer"
 import { documentWriterAgent } from "./document-writer"
 import { multimodalLookerAgent } from "./multimodal-looker"
 import { deepMerge } from "../shared"
 const allBuiltinAgents: Record<AgentName, AgentConfig> = {
@@ -13,6 +14,7 @@ const allBuiltinAgents: Record<AgentName, AgentConfig> = {
  explore: exploreAgent,
  "frontend-ui-ux-engineer": frontendUiUxEngineerAgent,
  "document-writer": documentWriterAgent,
  "multimodal-looker": multimodalLookerAgent,
 }
 function mergeAgentConfig(
--- a/src/index.ts
+++ b/src/index.ts
@@ -41,7 +41,7 @@ import {
  getCurrentSessionTitle,
 } from "./features/claude-code-session-state";
 import { updateTerminalTitle } from "./features/terminal";
-import { builtinTools, createCallOmoAgent, createBackgroundTools } from "./tools";
+import { builtinTools, createCallOmoAgent, createBackgroundTools, createLookAt } from "./tools";
 import { BackgroundManager } from "./features/background-agent";
 import { createBuiltinMcps } from "./mcp";
 import { OhMyOpenCodeConfigSchema, type OhMyOpenCodeConfig, type HookName } from "./config";
@@ -218,6 +218,7 @@ const OhMyOpenCodePlugin: Plugin = async (ctx) => {
  const backgroundTools = createBackgroundTools(backgroundManager, ctx.client);
  const callOmoAgent = createCallOmoAgent(ctx, backgroundManager);
  const lookAt = createLookAt(ctx);
  const googleAuthHooks = pluginConfig.google_auth
    ? await createGoogleAntigravityAuthPlugin(ctx)
@@ -230,6 +231,7 @@ const OhMyOpenCodePlugin: Plugin = async (ctx) => {
      ...builtinTools,
      ...backgroundTools,
      call_omo_agent: callOmoAgent,
      look_at: lookAt,
    },
    "chat.message": async (input, output) => {
@@ -268,6 +270,14 @@ const OhMyOpenCodePlugin: Plugin = async (ctx) => {
          call_omo_agent: false,
        };
      }
      if (config.agent["multimodal-looker"]) {
        config.agent["multimodal-looker"].tools = {
          ...config.agent["multimodal-looker"].tools,
          task: false,
          call_omo_agent: false,
          look_at: false,
        };
      }
      const mcpResult = (pluginConfig.claude_code?.mcp ?? true)
        ? await loadMcpConfigs()
--- a/src/tools/index.ts
+++ b/src/tools/index.ts
@@ -34,6 +34,7 @@ import type { BackgroundManager } from "../features/background-agent"
 type OpencodeClient = PluginInput["client"]
 export { createCallOmoAgent } from "./call-omo-agent"
 export { createLookAt } from "./look-at"
 export function createBackgroundTools(manager: BackgroundManager, client: OpencodeClient) {
  return {
--- a/src/tools/look-at/constants.ts
+++ b/src/tools/look-at/constants.ts
@@ -0,0 +1,23 @@
 /**
  * Name of the agent that look_at sessions are prompted with.
  * Matches the "multimodal-looker" key used for the built-in agent entry.
  */
 export const MULTIMODAL_LOOKER_AGENT = "multimodal-looker" as const
 /**
  * Description text for the `look_at` tool: what kinds of files it handles,
  * the two parameters it takes (`file_path`, `goal`), and example goals.
  *
  * NOTE(review): the closing sentence names "Gemini 2.5 Flash" explicitly —
  * keep it in sync with the model configured for the multimodal-looker agent.
  */
 export const LOOK_AT_DESCRIPTION = `Analyze media files (PDFs, images, diagrams) that require visual interpretation.
 Use this tool to extract specific information from files that cannot be processed as plain text:
 - PDF documents: extract text, tables, structure, specific sections
 - Images: describe layouts, UI elements, text content, diagrams
 - Charts/Graphs: explain data, trends, relationships
 - Screenshots: identify UI components, text, visual elements
 - Architecture diagrams: explain flows, connections, components
 Parameters:
 - file_path: Absolute path to the file to analyze
 - goal: What specific information to extract (be specific for better results)
 Examples:
 - "Extract all API endpoints from this OpenAPI spec PDF"
 - "Describe the UI layout and components in this screenshot"
 - "Explain the data flow in this architecture diagram"
 - "List all table data from page 3 of this PDF"
 This tool uses a separate context window with Gemini 2.5 Flash for multimodal analysis,
 saving tokens in the main conversation while providing accurate visual interpretation.`
--- a/src/tools/look-at/index.ts
+++ b/src/tools/look-at/index.ts
@@ -0,0 +1,3 @@
 export * from "./types"
 export * from "./constants"
 export { createLookAt } from "./tools"
--- a/src/tools/look-at/tools.ts
+++ b/src/tools/look-at/tools.ts
@@ -0,0 +1,91 @@
 import { tool, type PluginInput } from "@opencode-ai/plugin"
 import { LOOK_AT_DESCRIPTION, MULTIMODAL_LOOKER_AGENT } from "./constants"
 import type { LookAtArgs } from "./types"
 import { log } from "../../shared/logger"
 /**
  * Turn an error payload of unknown shape into readable text.
  *
  * Interpolating an object directly into a template string yields
  * "[object Object]"; this keeps the message returned to the caller useful.
  */
 function describeLookAtError(err: unknown): string {
  if (err instanceof Error) return err.message
  if (typeof err === "string") return err
  try {
    const serialized = JSON.stringify(err)
    // JSON.stringify returns undefined for values such as `undefined` or functions.
    if (serialized !== undefined) return serialized
  } catch {
    // Circular or otherwise non-serializable payload: fall back to String().
  }
  return String(err)
 }
 /**
  * Build the `look_at` tool.
  *
  * When executed, the tool creates a session whose parent is the calling
  * session, prompts it with the multimodal-looker agent (with the task,
  * call_omo_agent and look_at tools disabled), then reads the session's
  * messages back and returns the text of the most recent assistant message.
  *
  * @param ctx Plugin input; its `client` is used for all session calls.
  * @returns A tool definition taking `file_path` and `goal`. Its `execute`
  *   resolves to the extracted text, or to a string starting with "Error:"
  *   when session creation, prompting, or message retrieval reports an error,
  *   or when no assistant message exists.
  */
 export function createLookAt(ctx: PluginInput) {
  return tool({
    description: LOOK_AT_DESCRIPTION,
    args: {
      file_path: tool.schema.string().describe("Absolute path to the file to analyze"),
      goal: tool.schema.string().describe("What specific information to extract from the file"),
    },
    async execute(args: LookAtArgs, toolContext) {
      log(`[look_at] Analyzing file: ${args.file_path}, goal: ${args.goal}`)
      const prompt = `Analyze this file and extract the requested information.
 File path: ${args.file_path}
 Goal: ${args.goal}
 Read the file using the Read tool, then provide ONLY the extracted information that matches the goal.
 Be thorough on what was requested, concise on everything else.
 If the requested information is not found, clearly state what is missing.`
      log(`[look_at] Creating session with parent: ${toolContext.sessionID}`)
      const createResult = await ctx.client.session.create({
        body: {
          parentID: toolContext.sessionID,
          // Keep the session title short: only the first 50 characters of the goal.
          title: `look_at: ${args.goal.substring(0, 50)}`,
        },
      })
      if (createResult.error) {
        log(`[look_at] Session create error:`, createResult.error)
        return `Error: Failed to create session: ${describeLookAtError(createResult.error)}`
      }
      const sessionID = createResult.data.id
      log(`[look_at] Created session: ${sessionID}`)
      log(`[look_at] Sending prompt to session ${sessionID}`)
      const promptResult = await ctx.client.session.prompt({
        path: { id: sessionID },
        body: {
          agent: MULTIMODAL_LOOKER_AGENT,
          // Stop the looker from delegating further or calling itself.
          tools: {
            task: false,
            call_omo_agent: false,
            look_at: false,
          },
          parts: [{ type: "text", text: prompt }],
        },
      })
      // Report a failed prompt directly instead of letting it surface later as
      // a misleading "No response" error.
      if (promptResult.error) {
        log(`[look_at] Prompt error:`, promptResult.error)
        return `Error: Failed to send prompt: ${describeLookAtError(promptResult.error)}`
      }
      log(`[look_at] Prompt sent, fetching messages...`)
      const messagesResult = await ctx.client.session.messages({
        path: { id: sessionID },
      })
      if (messagesResult.error) {
        log(`[look_at] Messages error:`, messagesResult.error)
        return `Error: Failed to get messages: ${describeLookAtError(messagesResult.error)}`
      }
      const messages = messagesResult.data
      log(`[look_at] Got ${messages.length} messages`)
      // Newest assistant message first; a missing creation time sorts as 0.
      const lastAssistantMessage = messages
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        .filter((m: any) => m.info.role === "assistant")
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        .sort((a: any, b: any) => (b.info.time?.created ?? 0) - (a.info.time?.created ?? 0))[0]
      if (!lastAssistantMessage) {
        log(`[look_at] No assistant message found`)
        return `Error: No response from multimodal-looker agent`
      }
      log(`[look_at] Found assistant message with ${lastAssistantMessage.parts.length} parts`)
      // Only text parts are returned; any other part types are dropped.
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      const textParts = lastAssistantMessage.parts.filter((p: any) => p.type === "text")
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      const responseText = textParts.map((p: any) => p.text).join("\n")
      log(`[look_at] Got response, length: ${responseText.length}`)
      return responseText
    },
  })
 }
--- a/src/tools/look-at/types.ts
+++ b/src/tools/look-at/types.ts
@@ -0,0 +1,4 @@
 /** Arguments accepted by the `look_at` tool. */
 export interface LookAtArgs {
  /** Path of the file to analyze (the tool schema asks for an absolute path). */
  file_path: string
  /** What specific information should be extracted from the file. */
  goal: string
 }