feat: add look_at tool and multimodal-looker agent

Add a new tool and agent for analyzing media files (PDFs, images, diagrams)
that require visual interpretation beyond raw text.

- Add `multimodal-looker` agent using Gemini 2.5 Flash model
- Add `look_at` tool that spawns multimodal-looker sessions
- Restrict multimodal-looker from calling task/call_omo_agent/look_at tools

Inspired by Sourcegraph Ampcode's look_at tool design.

🤖 GENERATED WITH ASSISTANCE OF [OhMyOpenCode](https://github.com/code-yeongyu/oh-my-opencode)
YeonGyu-Kim committed 2025-12-13 15:25:29 +09:00
commit a3938e8c25 (parent 821b0b8e9f)
10 changed files with 180 additions and 1 deletion
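The multimodal-looker agent definition itself is among the changed files not shown in the hunks below. A minimal sketch of its likely shape, assuming opencode-style agent config fields (`name`, `model`, `prompt`, `tools`) — the model string and prompt text are assumptions, not the committed values:

```ts
// Hypothetical sketch of the multimodal-looker agent definition.
// Field names follow opencode-style agent config; values are illustrative.
const multimodalLookerAgent = {
  name: "multimodal-looker",
  model: "google/gemini-2.5-flash", // assumed model identifier
  prompt: "Analyze the given media file and report only what the goal asks for.",
  tools: {
    task: false,           // no spawning sub-agents
    call_omo_agent: false, // no delegating to other omo agents
    look_at: false,        // no recursing into itself
  },
}
```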


@@ -34,6 +34,7 @@ import type { BackgroundManager } from "../features/background-agent"
type OpencodeClient = PluginInput["client"]
export { createCallOmoAgent } from "./call-omo-agent"
export { createLookAt } from "./look-at"
export function createBackgroundTools(manager: BackgroundManager, client: OpencodeClient) {
  return {


@@ -0,0 +1,23 @@
export const MULTIMODAL_LOOKER_AGENT = "multimodal-looker" as const

export const LOOK_AT_DESCRIPTION = `Analyze media files (PDFs, images, diagrams) that require visual interpretation.

Use this tool to extract specific information from files that cannot be processed as plain text:
- PDF documents: extract text, tables, structure, specific sections
- Images: describe layouts, UI elements, text content, diagrams
- Charts/Graphs: explain data, trends, relationships
- Screenshots: identify UI components, text, visual elements
- Architecture diagrams: explain flows, connections, components

Parameters:
- file_path: Absolute path to the file to analyze
- goal: What specific information to extract (be specific for better results)

Examples:
- "Extract all API endpoints from this OpenAPI spec PDF"
- "Describe the UI layout and components in this screenshot"
- "Explain the data flow in this architecture diagram"
- "List all table data from page 3 of this PDF"

This tool uses a separate context window with Gemini 2.5 Flash for multimodal analysis,
saving tokens in the main conversation while providing accurate visual interpretation.`
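Given that description, a call carries just the two declared arguments. An illustrative example (the path and goal are made up, not from the commit):

```ts
import type { LookAtArgs } from "./types"

// Example arguments only; any absolute path to a PDF/image/diagram works.
const exampleArgs: LookAtArgs = {
  file_path: "/home/user/docs/service-architecture.png",
  goal: "Explain the data flow between the services shown in the diagram",
}
```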


@@ -0,0 +1,3 @@
export * from "./types"
export * from "./constants"
export { createLookAt } from "./tools"


@@ -0,0 +1,91 @@
import { tool, type PluginInput } from "@opencode-ai/plugin"
import { LOOK_AT_DESCRIPTION, MULTIMODAL_LOOKER_AGENT } from "./constants"
import type { LookAtArgs } from "./types"
import { log } from "../../shared/logger"

export function createLookAt(ctx: PluginInput) {
  return tool({
    description: LOOK_AT_DESCRIPTION,
    args: {
      file_path: tool.schema.string().describe("Absolute path to the file to analyze"),
      goal: tool.schema.string().describe("What specific information to extract from the file"),
    },
    async execute(args: LookAtArgs, toolContext) {
      log(`[look_at] Analyzing file: ${args.file_path}, goal: ${args.goal}`)

      const prompt = `Analyze this file and extract the requested information.
File path: ${args.file_path}
Goal: ${args.goal}
Read the file using the Read tool, then provide ONLY the extracted information that matches the goal.
Be thorough on what was requested, concise on everything else.
If the requested information is not found, clearly state what is missing.`

      // Spawn a child session so the analysis runs in its own context window,
      // keeping the file's contents out of the parent conversation.
      log(`[look_at] Creating session with parent: ${toolContext.sessionID}`)
      const createResult = await ctx.client.session.create({
        body: {
          parentID: toolContext.sessionID,
          title: `look_at: ${args.goal.substring(0, 50)}`,
        },
      })
      if (createResult.error) {
        log(`[look_at] Session create error:`, createResult.error)
        return `Error: Failed to create session: ${JSON.stringify(createResult.error)}`
      }
      const sessionID = createResult.data.id
      log(`[look_at] Created session: ${sessionID}`)

      // Run the prompt under the multimodal-looker agent, disabling the tools
      // that could spawn further sub-agents so the looker stays a leaf node.
      log(`[look_at] Sending prompt to session ${sessionID}`)
      const promptResult = await ctx.client.session.prompt({
        path: { id: sessionID },
        body: {
          agent: MULTIMODAL_LOOKER_AGENT,
          tools: {
            task: false,
            call_omo_agent: false,
            look_at: false,
          },
          parts: [{ type: "text", text: prompt }],
        },
      })
      if (promptResult.error) {
        log(`[look_at] Prompt error:`, promptResult.error)
        return `Error: Failed to prompt session: ${JSON.stringify(promptResult.error)}`
      }

      // Fetch the child session's transcript and return the newest assistant reply.
      log(`[look_at] Prompt sent, fetching messages...`)
      const messagesResult = await ctx.client.session.messages({
        path: { id: sessionID },
      })
      if (messagesResult.error) {
        log(`[look_at] Messages error:`, messagesResult.error)
        return `Error: Failed to get messages: ${JSON.stringify(messagesResult.error)}`
      }
      const messages = messagesResult.data
      log(`[look_at] Got ${messages.length} messages`)

      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      const lastAssistantMessage = messages
        .filter((m: any) => m.info.role === "assistant")
        .sort((a: any, b: any) => (b.info.time?.created || 0) - (a.info.time?.created || 0))[0]
      if (!lastAssistantMessage) {
        log(`[look_at] No assistant message found`)
        return `Error: No response from multimodal-looker agent`
      }
      log(`[look_at] Found assistant message with ${lastAssistantMessage.parts.length} parts`)

      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      const textParts = lastAssistantMessage.parts.filter((p: any) => p.type === "text")
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      const responseText = textParts.map((p: any) => p.text).join("\n")
      log(`[look_at] Got response, length: ${responseText.length}`)
      return responseText
    },
  })
}
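The export wiring in the first hunk suggests the factory is registered in a name-keyed tool map. A sketch of that registration, under the assumption that the map key doubles as the tool name (it would need to match the `look_at` key disabled in `session.prompt` above; `createTools` is a hypothetical name):

```ts
import type { PluginInput } from "@opencode-ai/plugin"
import { createLookAt } from "./look-at"

// Hypothetical wiring: expose the tool under the name "look_at" so the
// disable list passed to session.prompt can reference it.
export function createTools(ctx: PluginInput) {
  return {
    look_at: createLookAt(ctx),
  }
}
```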


@@ -0,0 +1,4 @@
export interface LookAtArgs {
  file_path: string
  goal: string
}