Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
4d58a47
support mcp
kyusonglee Mar 26, 2025
7419af3
update example
kyusonglee Mar 26, 2025
46ddbd3
update
kyusonglee Mar 26, 2025
545b7d3
update
kyusonglee Mar 26, 2025
8322c21
add mcp.json
kyusonglee Mar 26, 2025
ede4c25
use redis lite
qiandl2000 Apr 16, 2025
9b7e423
change for examples
qiandl2000 Apr 21, 2025
e563990
add vlm-r1-mcp
qiandl2000 May 6, 2025
923d193
Update test_mcp.py
qiandl2000 May 6, 2025
d0f8bb2
add both tool
qiandl2000 May 6, 2025
52769c9
Merge pull request #12 from qiandl2000/a4a_redislite
kyusonglee May 14, 2025
74a9eab
Merge pull request #13 from qiandl2000/mcp_vlm_r1
kyusonglee May 14, 2025
80aaf21
add sse for mcp and other changes in a4a
qiandl2000 May 19, 2025
be8428a
Merge branch 'develop/v0.2.5' into mcp_vlm_r1
qiandl2000 May 19, 2025
6018c83
Merge pull request #16 from qiandl2000/mcp_vlm_r1
kyusonglee May 19, 2025
c1113a3
add simple vlm for mcp test
kyusonglee May 19, 2025
3142cdc
fix mcp issues
kyusonglee May 22, 2025
9e0dca7
robot agent
kyusonglee May 29, 2025
d10c32c
add react based robot
kyusonglee May 29, 2025
172ea63
delete unnessarcy
kyusonglee May 29, 2025
2b9e8a7
add missing jsons
kyusonglee May 30, 2025
31f0ae2
update react based robot
kyusonglee May 30, 2025
50d9049
fix bug for vector db
kyusonglee May 30, 2025
6dabf49
upgrade mcp_tools and react agent
kyusonglee May 31, 2025
e100327
update
kyusonglee May 31, 2025
bc37475
remove logs
kyusonglee May 31, 2025
16da9bc
delete omagent4agentbased
kyusonglee Jun 4, 2025
1f715d6
modify ut_dog_server
kyusonglee Jun 4, 2025
ef05005
add robot
kyusonglee Jun 4, 2025
8e27e58
Chinese version with some extra changes
qiandl2000 Jun 11, 2025
26469c7
Merge pull request #17 from qiandl2000/vlm_example_1
kyusonglee Jun 12, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -157,4 +157,8 @@ video_cache/
.vscode

# JSON files
*.json
!mcp.json
import os
memory_log.json
memory_summary.md
rgb_observation.jpg
Empty file added 1.txt
Empty file.
43 changes: 43 additions & 0 deletions docs/concepts/tool_system/mcp.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Model Control Protocol (MCP)

OmAgent's Model Control Protocol (MCP) system enables seamless integration with external AI models and services through a standardized interface. This protocol allows OmAgent to dynamically discover, register, and execute tools from multiple external servers, extending its capabilities without modifying the core codebase.

## MCP Configuration File

MCP servers are configured in a JSON file, typically named `mcp.json`. This file defines the servers that OmAgent can connect to. Each server has a unique name, command to execute, arguments, and environment variables.

Here's an example of a basic `mcp.json` file that configures multiple MCP servers:

```json
{
"mcpServers": {
"desktop-commander": {
"command": "npx",
"args": [
"-y",
"@smithery/cli@latest",
"run",
"@wonderwhy-er/desktop-commander",
"--key",
"your-api-key-here"
]
},
    ...
  }
}
```

By default, OmAgent looks for this file in the following location:
1. Inside the tool_system directory: `omagent-core/src/omagent_core/tool_system/mcp.json`, where it will be loaded automatically.

## Executing MCP Tools

MCP tools can be executed just like any other tool using the ToolManager:

```python
# Let the ToolManager choose the appropriate tool
x = tool_manager.execute_task("command ls -l for the current directory")
print(x)
```

For more details on creating MCP servers, refer to the [MCP specification](https://github.com/modelcontextprotocol/python-sdk).
308 changes: 308 additions & 0 deletions examples/mcp_example/server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,308 @@
import os
import argparse
import sys
import tempfile
import shutil
import uuid
from typing import Optional, Dict, Any, List
import re
from urllib.parse import urlparse

# Ensure the latest fastmcp is installed -----------------------------------------------------------
# NOTE(review): installing packages at import time is convenient for demos but surprising in
# production; consider declaring these in requirements instead.
try:
    from fastmcp import FastMCP, Context
except ImportError:
    # Fall back to installing straight from the GitHub source, then retry the import.
    print("fastmcp not found – installing from GitHub ...")
    import subprocess
    subprocess.check_call([
        sys.executable,
        "-m",
        "pip",
        "install",
        "git+https://github.com/jlowin/fastmcp.git"
    ])
    from fastmcp import FastMCP, Context

# Try to import requests for URL handling
try:
    import requests
except ImportError:
    # Same self-install fallback as above, from PyPI this time.
    print("requests not found – installing...")
    import subprocess
    subprocess.check_call([
        sys.executable,
        "-m",
        "pip",
        "install",
        "requests"
    ])
    import requests

# Import our VLM-R1 model ---------------------------------------------------------------------------
try:
    from vlm_r1 import VLMR1
except ImportError as e:
    # The model wrapper is mandatory; exit with guidance rather than failing on first use.
    print(f"Error importing VLMR1: {e}")
    print("Make sure the src/vlm_r1.py file exists and all dependencies are installed.")
    print("You may need to install additional packages, e.g.:\n pip install torch transformers pillow flash-attn bitsandbytes")
    sys.exit(1)

# -----------------------------------------------------------------------------------------------
# Create the MCP server instance
mcp = FastMCP("VLM-R1 Server – fastmcp 2.x")

# Keep a global handle to the loaded model so that we only pay the load cost once.
# Populated lazily by init_model(); None until the first load.
_model: Optional[VLMR1] = None

# Global temp directory for downloaded images (created lazily by get_temp_dir()).
_temp_dir = None

def get_temp_dir():
    """Return the scratch directory used for downloaded images.

    The directory is created on first use, and re-created if it has been
    removed since, so callers can rely on the returned path existing.
    """
    global _temp_dir
    needs_new = _temp_dir is None or not os.path.exists(_temp_dir)
    if needs_new:
        _temp_dir = tempfile.mkdtemp(prefix="vlm_r1_images_")
    return _temp_dir

def is_url(path: str) -> bool:
    """Return True if *path* looks like an absolute URL (has both scheme and host).

    Local filesystem paths, relative paths, and empty strings return False.
    The original bare ``except:`` also swallowed KeyboardInterrupt/SystemExit;
    this version only catches the ValueError that urlparse can actually raise.
    """
    try:
        result = urlparse(path)
    except ValueError:
        # urlparse raises ValueError for e.g. malformed ports / IPv6 brackets.
        return False
    return bool(result.scheme and result.netloc)

def download_image(url: str) -> str:
    """Download an image from URL and return the local path.

    The image is stored in the shared temp directory (see get_temp_dir())
    under a random name whose extension is derived from the Content-Type
    header, defaulting to ``.jpg``.

    Raises:
        ValueError: if the download fails or the URL does not serve an image.
    """
    # Content-Type -> file extension; anything unrecognised falls back to .jpg.
    ext_for_type = {
        "image/jpeg": ".jpg",
        "image/jpg": ".jpg",
        "image/png": ".png",
        "image/gif": ".gif",
        "image/webp": ".webp",
        "image/bmp": ".bmp",
    }
    try:
        # Use the response as a context manager so the connection is always
        # released (the original never closed the streamed response).
        with requests.get(url, stream=True, timeout=10) as response:
            response.raise_for_status()

            content_type = response.headers.get('Content-Type', '')
            if 'image' not in content_type:
                raise ValueError(f"URL does not point to an image (content-type: {content_type})")

            ext = next(
                (e for t, e in ext_for_type.items() if t in content_type),
                '.jpg',
            )

            # Stream the body straight into a uniquely-named temp file.
            temp_path = os.path.join(get_temp_dir(), f"{uuid.uuid4()}{ext}")
            with open(temp_path, 'wb') as f:
                shutil.copyfileobj(response.raw, f)

        return temp_path
    except Exception as e:
        raise ValueError(f"Failed to download image from URL: {str(e)}")

def init_model(
    model_path: str,
    use_flash_attention: bool = True,
    low_cpu_mem_usage: bool = True,
    load_in_8bit: bool = False,
    specific_device: Optional[str] = None,
):
    """Load the VLM-R1 checkpoint once per process and cache it in ``_model``.

    Subsequent calls return the cached instance without reloading.
    """
    global _model
    if _model is not None:
        return _model

    print(f"[fastmcp-server] Loading VLM-R1 from '{model_path}' … This can take a few minutes.")

    # When we pin the device, we must keep low_cpu_mem_usage=True per transformers semantics
    low_cpu_mem_usage = True if specific_device is not None else low_cpu_mem_usage

    _model = VLMR1.load(
        model_path=model_path,
        use_flash_attention=use_flash_attention,
        low_cpu_mem_usage=low_cpu_mem_usage,
        load_in_8bit=load_in_8bit,
        specific_device=specific_device,
    )
    print("[fastmcp-server] Model ready! 🚀")
    return _model

# -------------------------------------------------------------------------------------------------
# RESOURCE: expose images so that remote clients can fetch binary data if they wish
@mcp.resource("image://{image_path}")
def image_resource(image_path: str) -> bytes:  # noqa: D401
    """Serve the raw bytes of *image_path* (local file or URL) to clients."""
    if not is_url(image_path):
        # Local file: fail fast if it is missing.
        if not os.path.exists(image_path):
            raise ValueError(f"Image not found at '{image_path}'.")
        with open(image_path, "rb") as fh:
            return fh.read()
    # URL: download to the temp dir first, then read the cached copy back.
    try:
        local_path = download_image(image_path)
        with open(local_path, "rb") as fh:
            return fh.read()
    except Exception as e:
        raise ValueError(f"Failed to fetch image from URL: {str(e)}")

# -------------------------------------------------------------------------------------------------
# TOOL: generic analyse image
@mcp.tool()
async def analyze_image(
    image_path: str,
    question: Optional[str] = None,
    max_new_tokens: int = 1024,
    max_image_size: int = 448,
    resize_mode: str = "shorter",
    ctx: Context | None = None,
) -> Dict[str, Any]:
    """Run the multimodal VLM-R1 model on *image_path*.

    *image_path* may be a local file path or an image URL. When *question* is
    omitted, a default detailed-description prompt is used. Returns the dict
    produced by :py:meth:`VLMR1.predict`, or an ``{"error": ...}`` dict on
    download/prediction failure.
    """
    global _model
    if _model is None:
        _model = init_model(DEFAULT_MODEL_PATH)

    if _model is None:
        raise RuntimeError("Model not initialised – call init_model() first or start the server with --model-path …")

    # Resolve URLs to a local download; reject missing local files.
    if is_url(image_path):
        try:
            resolved_path = download_image(image_path)
        except Exception as e:
            return {"error": f"Failed to download image from URL: {str(e)}"}
    else:
        if not os.path.exists(image_path):
            return {"error": f"Image not found: {image_path}"}
        resolved_path = image_path

    prompt = question if question is not None else (
        "Describe this image in detail. First output the thinking process in <think></think> tags "
        "and then output the final answer in <answer></answer> tags."
    )

    try:
        return _model.predict(
            image_path=resolved_path,
            question=prompt,
            max_new_tokens=max_new_tokens,
            max_image_size=max_image_size,
            resize_mode=resize_mode,
        )
    except Exception as e:
        return {"error": f"Error during prediction: {str(e)}"}

# -------------------------------------------------------------------------------------------------
# TOOL: object detection helper
@mcp.tool()
async def detect_objects(
    image_path: str,
    max_new_tokens: int = 1024,
    max_image_size: int = 448,
    ctx: Context | None = None,
) -> Dict[str, Any]:
    """Detect objects in *image_path* (local file or URL) with VLM-R1.

    Returns the dict produced by :py:meth:`VLMR1.predict`, or an
    ``{"error": ...}`` dict on download/prediction failure.
    """
    global _model
    if _model is None:
        _model = init_model(DEFAULT_MODEL_PATH)

    if _model is None:
        raise RuntimeError("Model not initialised – call init_model() first or start the server with --model-path …")

    # Resolve URLs to a local download; reject missing local files.
    if is_url(image_path):
        try:
            resolved_path = download_image(image_path)
        except Exception as e:
            return {"error": f"Failed to download image from URL: {str(e)}"}
    else:
        if not os.path.exists(image_path):
            return {"error": f"Image not found: {image_path}"}
        resolved_path = image_path

    try:
        return _model.predict(
            image_path=resolved_path,
            question="Detect all objects in this image. Provide bounding boxes if possible.",
            max_new_tokens=max_new_tokens,
            max_image_size=max_image_size,
            resize_mode="shorter",
        )
    except Exception as e:
        return {"error": f"Error during prediction: {str(e)}"}

# -------------------------------------------------------------------------------------------------
# TOOL: list available images in a directory
@mcp.tool()
def list_images(directory: str = ".") -> List[str] | Dict[str, str]:
    """Return a sorted list of image file paths found in *directory*.

    On a missing (or non-directory) path, returns an ``{"error": ...}`` dict
    instead of raising, matching the error convention of the other tools —
    the original annotation claimed ``List[str]`` but already returned a dict
    here, and ``os.path.exists`` let non-directories through to ``listdir``,
    which raised NotADirectoryError.
    """
    exts = {".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"}
    if not os.path.isdir(directory):
        return {"error": f"Directory not found: {directory}"}
    # Sort for deterministic output (os.listdir order is arbitrary).
    return sorted(
        os.path.join(directory, f)
        for f in os.listdir(directory)
        if os.path.splitext(f)[1].lower() in exts
    )

# -------------------------------------------------------------------------------------------------
# PROMPT helper – illustrates prompt templates
@mcp.prompt()
def image_analysis_prompt(image_path: str) -> str:
    """Build the standard analysis prompt for *image_path* (local file or URL)."""
    return (
        f"Please analyse the image at {image_path}. "
        "First describe what you see, then identify key objects or elements in the image."
    )

# -------------------------------------------------------------------------------------------------
# Command-line interface so that users can run this file directly
DEFAULT_MODEL_PATH = "omlab/VLM-R1-Qwen2.5VL-3B-OVD-0321"

def _parse_args() -> argparse.Namespace:
p = argparse.ArgumentParser(description="Run a VLM-R1 server powered by fastmcp 2.x")
p.add_argument("--model-path", default=DEFAULT_MODEL_PATH, help="HuggingFace repo or local checkpoint directory")
p.add_argument("--device", default="cuda:0", help="Device to run on (e.g. cuda:0 or cpu)")
p.add_argument("--use-flash-attention", action="store_true", help="Enable flash-attention kernels if available")
p.add_argument("--low-cpu-mem", action="store_true", help="Load with low CPU memory footprint")
p.add_argument("--load-in-8bit", action="store_true", help="Load in 8-bit precision")
return p.parse_args()


def main():
    """CLI entry point: pre-load the model, then serve MCP over SSE on port 8008."""
    args = _parse_args()

    # Pre-load the model so the first request is fast (optional but helpful).
    load_kwargs = dict(
        model_path=args.model_path,
        use_flash_attention=args.use_flash_attention,
        low_cpu_mem_usage=args.low_cpu_mem,
        load_in_8bit=args.load_in_8bit,
        specific_device=args.device,
    )
    init_model(**load_kwargs)

    # Make sure the image download directory exists before serving.
    get_temp_dir()

    try:
        mcp.run(transport="sse", host="0.0.0.0", port=8008)
    finally:
        # Always clean up downloaded images, even if the server crashes.
        if _temp_dir and os.path.exists(_temp_dir):
            shutil.rmtree(_temp_dir, ignore_errors=True)


if __name__ == "__main__":
    main()
Loading