feat: add look_at tool and multimodal-looker agent

Add a new tool and agent for analyzing media files (PDFs, images, diagrams)
that require visual interpretation beyond raw text.

- Add `multimodal-looker` agent using Gemini 2.5 Flash model
- Add `look_at` tool that spawns multimodal-looker sessions
- Restrict multimodal-looker from calling task/call_omo_agent/look_at tools

Inspired by Sourcegraph Ampcode's look_at tool design.

🤖 GENERATED WITH ASSISTANCE OF [OhMyOpenCode](https://github.com/code-yeongyu/oh-my-opencode)
This commit is contained in:
YeonGyu-Kim
2025-12-13 15:25:29 +09:00
parent 821b0b8e9f
commit a3938e8c25
10 changed files with 180 additions and 1 deletions

View File

@@ -4,6 +4,7 @@ import { librarianAgent } from "./librarian"
import { exploreAgent } from "./explore"
import { frontendUiUxEngineerAgent } from "./frontend-ui-ux-engineer"
import { documentWriterAgent } from "./document-writer"
import { multimodalLookerAgent } from "./multimodal-looker"
export const builtinAgents: Record<string, AgentConfig> = {
oracle: oracleAgent,
@@ -11,6 +12,7 @@ export const builtinAgents: Record<string, AgentConfig> = {
explore: exploreAgent,
"frontend-ui-ux-engineer": frontendUiUxEngineerAgent,
"document-writer": documentWriterAgent,
"multimodal-looker": multimodalLookerAgent,
}
export * from "./types"

View File

@@ -0,0 +1,42 @@
import type { AgentConfig } from "@opencode-ai/sdk"
export const multimodalLookerAgent: AgentConfig = {
description:
"Analyze media files (PDFs, images, diagrams) that require interpretation beyond raw text. Extracts specific information or summaries from documents, describes visual content. Use when you need analyzed/extracted data rather than literal file contents.",
mode: "subagent",
model: "google/gemini-2.5-flash",
temperature: 0.1,
tools: { Read: true },
prompt: `You interpret media files that cannot be read as plain text.
Your job: examine the attached file and extract ONLY what was requested.
When to use you:
- Media files the Read tool cannot interpret
- Extracting specific information or summaries from documents
- Describing visual content in images or diagrams
- When analyzed/extracted data is needed, not raw file contents
When NOT to use you:
- Source code or plain text files needing exact contents (use Read)
- Files that need editing afterward (need literal content from Read)
- Simple file reading where no interpretation is needed
How you work:
1. Receive a file path and a goal describing what to extract
2. Read and analyze the file deeply
3. Return ONLY the relevant extracted information
4. The main agent never processes the raw file - you save context tokens
For PDFs: extract text, structure, tables, data from specific sections
For images: describe layouts, UI elements, text, diagrams, charts
For diagrams: explain relationships, flows, architecture depicted
Response rules:
- Return extracted information directly, no preamble
- If info not found, state clearly what's missing
- Match the language of the request
- Be thorough on the goal, concise on everything else
Your output goes straight to the main agent for continued work.`,
}

View File

@@ -6,6 +6,7 @@ export type AgentName =
| "explore"
| "frontend-ui-ux-engineer"
| "document-writer"
| "multimodal-looker"
export type AgentOverrideConfig = Partial<AgentConfig>

View File

@@ -5,6 +5,7 @@ import { librarianAgent } from "./librarian"
import { exploreAgent } from "./explore"
import { frontendUiUxEngineerAgent } from "./frontend-ui-ux-engineer"
import { documentWriterAgent } from "./document-writer"
import { multimodalLookerAgent } from "./multimodal-looker"
import { deepMerge } from "../shared"
const allBuiltinAgents: Record<AgentName, AgentConfig> = {
@@ -13,6 +14,7 @@ const allBuiltinAgents: Record<AgentName, AgentConfig> = {
explore: exploreAgent,
"frontend-ui-ux-engineer": frontendUiUxEngineerAgent,
"document-writer": documentWriterAgent,
"multimodal-looker": multimodalLookerAgent,
}
function mergeAgentConfig(