Files
aitbc/cli/aitbc_cli/commands/multimodal.py
oib 825f157749 Update Python version requirements and fix compatibility issues
- Bump minimum Python version from 3.11 to 3.13 across all apps
- Add Python 3.11-3.13 test matrix to CLI workflow
- Document Python 3.11+ requirement in .env.example
- Fix Starlette Broadcast removal with in-process fallback implementation
- Add _InProcessBroadcast class for tests when Starlette Broadcast is unavailable
- Refactor API key validators to read live settings instead of cached values
- Update database models with explicit
2026-02-24 18:41:08 +01:00

471 lines
16 KiB
Python

"""Multi-modal processing commands for AITBC CLI"""
import click
import httpx
import json
import base64
import mimetypes
from typing import Optional, Dict, Any, List
from pathlib import Path
from ..utils import output, error, success, warning
# Root command group of this module. Every multi-modal subcommand and
# sub-group defined below is attached to it. The docstring is left terse on
# purpose: Click shows it as the group's help text.
@click.group()
def multimodal():
    """Multi-modal agent processing and cross-modal operations"""
# Create a multi-modal agent on the coordinator.
#
# The payload is assembled from the CLI options; when --model-config is given,
# the parsed JSON file is embedded under "model_config". The payload is POSTed
# to `<coordinator_url>/v1/multimodal/agents` with the configured API key.
#   * HTTP 201: the created agent is announced and printed.
#   * Any other status: status code (and body, if any) are reported, exit 1.
#   * Exception while performing the request/handling the reply: reported as a
#     network error, exit 1.
# An unreadable/invalid model config file is reported and the command returns
# early without contacting the coordinator.
@multimodal.command()
@click.option("--name", required=True, help="Multi-modal agent name")
@click.option("--modalities", required=True, help="Comma-separated modalities (text,image,audio,video)")
@click.option("--description", default="", help="Agent description")
@click.option("--model-config", type=click.File('r'), help="Model configuration JSON file")
@click.option("--gpu-acceleration", is_flag=True, help="Enable GPU acceleration")
@click.pass_context
def agent(ctx, name: str, modalities: str, description: str, model_config, gpu_acceleration: bool):
    """Create multi-modal agent"""
    config = ctx.obj['config']

    agent_data = {
        "name": name,
        "description": description,
        "modalities": [mod.strip() for mod in modalities.split(',')],
        "gpu_acceleration": gpu_acceleration,
        "agent_type": "multimodal",
    }

    if model_config:
        try:
            agent_data["model_config"] = json.load(model_config)
        except Exception as e:
            error(f"Failed to read model config file: {e}")
            return

    try:
        with httpx.Client() as client:
            response = client.post(
                f"{config.coordinator_url}/v1/multimodal/agents",
                headers={"X-Api-Key": config.api_key or ""},
                json=agent_data,
            )

            if response.status_code == 201:
                # Named `created` so the local does not shadow this command.
                created = response.json()
                success(f"Multi-modal agent created: {created['id']}")
                output(created, ctx.obj['output_format'])
            else:
                error(f"Failed to create multi-modal agent: {response.status_code}")
                if response.text:
                    error(response.text)
                ctx.exit(1)
    except click.exceptions.Exit:
        # ctx.exit() works by raising Exit (a RuntimeError subclass). Let it
        # propagate so the handler below does not mislabel it a network error.
        raise
    except Exception as e:
        error(f"Network error: {e}")
        ctx.exit(1)
def _encode_media_file(path: str, fallback_mime: str) -> Dict[str, str]:
    """Read *path* as bytes and return its base64 payload entry (data, MIME type, file name)."""
    with open(path, 'rb') as f:
        raw = f.read()
    return {
        "data": base64.b64encode(raw).decode(),
        # Fall back to a per-modality default when the type cannot be guessed.
        "mime_type": mimetypes.guess_type(path)[0] or fallback_mime,
        "filename": Path(path).name,
    }


# Send one or more inputs (text and/or image, audio, video files) to an agent.
#
# Each supplied media file is read and base64-encoded into the request body;
# the body is POSTed to
# `<coordinator_url>/v1/multimodal/agents/<agent_id>/process`.
#   * HTTP 200: the result is printed.
#   * Any other status: status code (and body, if any) are reported, exit 1.
#   * Exception while performing the request/handling the reply: reported as a
#     network error, exit 1.
# If a media file cannot be read, or no input at all was given, the problem is
# reported and the command returns early without contacting the coordinator.
# Note: the --output-format option here is forwarded to the coordinator in the
# request body; how the reply is printed is governed by
# ctx.obj['output_format'].
@multimodal.command()
@click.argument("agent_id")
@click.option("--text", help="Text input")
@click.option("--image", type=click.Path(exists=True), help="Image file path")
@click.option("--audio", type=click.Path(exists=True), help="Audio file path")
@click.option("--video", type=click.Path(exists=True), help="Video file path")
@click.option("--output-format", default="json", type=click.Choice(["json", "text", "binary"]),
              help="Output format for results")
@click.pass_context
def process(ctx, agent_id: str, text: Optional[str], image: Optional[str],
            audio: Optional[str], video: Optional[str], output_format: str):
    """Process multi-modal inputs with agent"""
    config = ctx.obj['config']

    # Prepare multi-modal data
    modal_data: Dict[str, Any] = {}
    if text:
        modal_data["text"] = text

    # (payload key, CLI value, MIME type used when guessing fails)
    media_inputs = (
        ("image", image, "image/jpeg"),
        ("audio", audio, "audio/wav"),
        ("video", video, "video/mp4"),
    )
    for kind, path, fallback_mime in media_inputs:
        if not path:
            continue
        try:
            modal_data[kind] = _encode_media_file(path, fallback_mime)
        except Exception as e:
            error(f"Failed to read {kind} file: {e}")
            return

    if not modal_data:
        error("At least one modality input must be provided")
        return

    process_data = {
        "modalities": modal_data,
        "output_format": output_format,
    }

    try:
        with httpx.Client() as client:
            response = client.post(
                f"{config.coordinator_url}/v1/multimodal/agents/{agent_id}/process",
                headers={"X-Api-Key": config.api_key or ""},
                json=process_data,
            )

            if response.status_code == 200:
                result = response.json()
                success("Multi-modal processing completed")
                output(result, ctx.obj['output_format'])
            else:
                error(f"Failed to process multi-modal inputs: {response.status_code}")
                if response.text:
                    error(response.text)
                ctx.exit(1)
    except click.exceptions.Exit:
        # ctx.exit() works by raising Exit (a RuntimeError subclass). Let it
        # propagate so the handler below does not mislabel it a network error.
        raise
    except Exception as e:
        error(f"Network error: {e}")
        ctx.exit(1)
# Start a benchmark run for an agent.
#
# POSTs the dataset name, the list of metrics (split from the comma-separated
# --metrics value) and the iteration count to
# `<coordinator_url>/v1/multimodal/agents/<agent_id>/benchmark`.
#   * HTTP 202: the benchmark id is announced and the reply printed.
#   * Any other status: status code (and body, if any) are reported, exit 1.
#   * Exception while performing the request/handling the reply: reported as a
#     network error, exit 1.
@multimodal.command()
@click.argument("agent_id")
@click.option("--dataset", default="coco_vqa", help="Dataset name for benchmarking")
@click.option("--metrics", default="accuracy,latency", help="Comma-separated metrics to evaluate")
@click.option("--iterations", default=100, help="Number of benchmark iterations")
@click.pass_context
def benchmark(ctx, agent_id: str, dataset: str, metrics: str, iterations: int):
    """Benchmark multi-modal agent performance"""
    config = ctx.obj['config']

    benchmark_data = {
        "dataset": dataset,
        "metrics": [m.strip() for m in metrics.split(',')],
        "iterations": iterations,
    }

    try:
        with httpx.Client() as client:
            response = client.post(
                f"{config.coordinator_url}/v1/multimodal/agents/{agent_id}/benchmark",
                headers={"X-Api-Key": config.api_key or ""},
                json=benchmark_data,
            )

            if response.status_code == 202:
                # Named `started` so the local does not shadow this command.
                started = response.json()
                success(f"Benchmark started: {started['id']}")
                output(started, ctx.obj['output_format'])
            else:
                error(f"Failed to start benchmark: {response.status_code}")
                if response.text:
                    error(response.text)
                ctx.exit(1)
    except click.exceptions.Exit:
        # ctx.exit() works by raising Exit (a RuntimeError subclass). Let it
        # propagate so the handler below does not mislabel it a network error.
        raise
    except Exception as e:
        error(f"Network error: {e}")
        ctx.exit(1)
# Ask the coordinator to optimize an agent's pipeline for one objective.
#
# POSTs the objective (and the --target value, only when one was given) to
# `<coordinator_url>/v1/multimodal/agents/<agent_id>/optimize`.
#   * HTTP 200: the result is printed.
#   * Any other status: status code (and body, if any) are reported, exit 1.
#   * Exception while performing the request/handling the reply: reported as a
#     network error, exit 1.
@multimodal.command()
@click.argument("agent_id")
@click.option("--objective", default="throughput",
              type=click.Choice(["throughput", "latency", "accuracy", "efficiency"]),
              help="Optimization objective")
@click.option("--target", help="Target value for optimization")
@click.pass_context
def optimize(ctx, agent_id: str, objective: str, target: Optional[str]):
    """Optimize multi-modal agent pipeline"""
    config = ctx.obj['config']

    optimization_data = {"objective": objective}
    if target:
        # Sent as the raw string typed on the command line.
        optimization_data["target"] = target

    try:
        with httpx.Client() as client:
            response = client.post(
                f"{config.coordinator_url}/v1/multimodal/agents/{agent_id}/optimize",
                headers={"X-Api-Key": config.api_key or ""},
                json=optimization_data,
            )

            if response.status_code == 200:
                result = response.json()
                success("Multi-modal optimization completed")
                output(result, ctx.obj['output_format'])
            else:
                error(f"Failed to optimize agent: {response.status_code}")
                if response.text:
                    error(response.text)
                ctx.exit(1)
    except click.exceptions.Exit:
        # ctx.exit() works by raising Exit (a RuntimeError subclass). Let it
        # propagate so the handler below does not mislabel it a network error.
        raise
    except Exception as e:
        error(f"Network error: {e}")
        ctx.exit(1)
# `convert` sub-group, registered on `multimodal` by the decorator itself.
# NOTE: the subcommand defined right after this group is also a function named
# `convert`, so once it is defined the module-level name `convert` refers to
# that subcommand; the group stays reachable through `multimodal`.
@multimodal.group()
def convert():
    """Cross-modal conversion operations"""
# Convert a file from one modality to another (subcommand of the `convert`
# group).
#
# The input file is read, base64-encoded and POSTed together with the target
# modality and model name to `<coordinator_url>/v1/multimodal/convert`.
#   * HTTP 200: when --output-file is given and the reply carries
#     "output_data", that field is base64-decoded and written to the file;
#     otherwise the reply is printed.
#   * Any other status: status code (and body, if any) are reported, exit 1.
#   * Exception while performing the request/handling the reply: reported as a
#     network error, exit 1.
# An unreadable input file is reported and the command returns early.
# Note: `output_format` here is the value of --output (the target modality),
# not the display format stored in ctx.obj['output_format'].
@convert.command()
@click.option("--input", "input_path", required=True, type=click.Path(exists=True), help="Input file path")
@click.option("--output", "output_format", required=True,
              type=click.Choice(["text", "image", "audio", "video"]),
              help="Output modality")
@click.option("--model", default="blip", help="Conversion model to use")
@click.option("--output-file", type=click.Path(), help="Output file path")
@click.pass_context
def convert(ctx, input_path: str, output_format: str, model: str, output_file: Optional[str]):
    """Convert between modalities"""
    config = ctx.obj['config']

    # Read input file
    try:
        with open(input_path, 'rb') as f:
            input_data = f.read()
    except Exception as e:
        error(f"Failed to read input file: {e}")
        return

    conversion_data = {
        "input": {
            "data": base64.b64encode(input_data).decode(),
            "mime_type": mimetypes.guess_type(input_path)[0] or "application/octet-stream",
            "filename": Path(input_path).name,
        },
        "output_modality": output_format,
        "model": model,
    }

    try:
        with httpx.Client() as client:
            response = client.post(
                f"{config.coordinator_url}/v1/multimodal/convert",
                headers={"X-Api-Key": config.api_key or ""},
                json=conversion_data,
            )

            if response.status_code == 200:
                result = response.json()

                if output_file and result.get("output_data"):
                    # Decode and save output
                    decoded = base64.b64decode(result["output_data"])
                    with open(output_file, 'wb') as f:
                        f.write(decoded)
                    success(f"Conversion output saved to {output_file}")
                else:
                    output(result, ctx.obj['output_format'])
            else:
                error(f"Failed to convert modality: {response.status_code}")
                if response.text:
                    error(response.text)
                ctx.exit(1)
    except click.exceptions.Exit:
        # ctx.exit() works by raising Exit (a RuntimeError subclass). Let it
        # propagate so the handler below does not mislabel it a network error.
        raise
    except Exception as e:
        error(f"Network error: {e}")
        ctx.exit(1)
# `search` sub-group, registered on `multimodal` by the decorator itself.
# NOTE: the subcommand defined right after this group is also a function named
# `search`, so once it is defined the module-level name `search` refers to
# that subcommand; the group stays reachable through `multimodal`.
@multimodal.group()
def search():
    """Multi-modal search operations"""
# Run a search query across several modalities (subcommand of the `search`
# group).
#
# POSTs the query, the list of modalities (split from the comma-separated
# --modalities value), the result limit and the similarity threshold to
# `<coordinator_url>/v1/multimodal/search`.
#   * HTTP 200: the results are printed.
#   * Any other status: status code (and body, if any) are reported, exit 1.
#   * Exception while performing the request/handling the reply: reported as a
#     network error, exit 1.
@search.command()
@click.argument("query")
@click.option("--modalities", default="image,text", help="Comma-separated modalities to search")
@click.option("--limit", default=20, help="Number of results to return")
@click.option("--threshold", default=0.5, help="Similarity threshold")
@click.pass_context
def search(ctx, query: str, modalities: str, limit: int, threshold: float):
    """Multi-modal search across different modalities"""
    config = ctx.obj['config']

    search_data = {
        "query": query,
        "modalities": [m.strip() for m in modalities.split(',')],
        "limit": limit,
        "threshold": threshold,
    }

    try:
        with httpx.Client() as client:
            response = client.post(
                f"{config.coordinator_url}/v1/multimodal/search",
                headers={"X-Api-Key": config.api_key or ""},
                json=search_data,
            )

            if response.status_code == 200:
                results = response.json()
                output(results, ctx.obj['output_format'])
            else:
                error(f"Failed to perform multi-modal search: {response.status_code}")
                # Show the server's explanation too, as the sibling commands do.
                if response.text:
                    error(response.text)
                ctx.exit(1)
    except click.exceptions.Exit:
        # ctx.exit() works by raising Exit (a RuntimeError subclass). Let it
        # propagate so the handler below does not mislabel it a network error.
        raise
    except Exception as e:
        error(f"Network error: {e}")
        ctx.exit(1)
# `attention` sub-group, registered on `multimodal` by the decorator itself.
# NOTE: the subcommand defined right after this group is also a function named
# `attention`, so once it is defined the module-level name `attention` refers
# to that subcommand; the group stays reachable through `multimodal`.
@multimodal.group()
def attention():
    """Cross-modal attention analysis"""
# Analyze cross-modal attention for an agent (subcommand of the `attention`
# group).
#
# The JSON document given via --inputs is POSTed, together with the
# --visualize flag, to
# `<coordinator_url>/v1/multimodal/agents/<agent_id>/attention`.
#   * HTTP 200: when --visualize and --output are both given and the reply
#     carries "visualization", that field is base64-decoded and written to the
#     output file; otherwise the reply is printed.
#   * Any other status: status code (and body, if any) are reported, exit 1.
#   * Exception while performing the request/handling the reply: reported as a
#     network error, exit 1.
# An unreadable/invalid inputs file is reported and the command returns early.
@attention.command()
@click.argument("agent_id")
@click.option("--inputs", type=click.File('r'), required=True, help="Multi-modal inputs JSON file")
@click.option("--visualize", is_flag=True, help="Generate attention visualization")
# --output is bound to `output_path`: a parameter called `output` would shadow
# the imported `output()` helper that prints the result further down.
@click.option("--output", "output_path", type=click.Path(), help="Output file for visualization")
@click.pass_context
def attention(ctx, agent_id: str, inputs, visualize: bool, output_path: Optional[str]):
    """Analyze cross-modal attention patterns"""
    config = ctx.obj['config']

    try:
        inputs_data = json.load(inputs)
    except Exception as e:
        error(f"Failed to read inputs file: {e}")
        return

    attention_data = {
        "inputs": inputs_data,
        "visualize": visualize,
    }

    try:
        with httpx.Client() as client:
            response = client.post(
                f"{config.coordinator_url}/v1/multimodal/agents/{agent_id}/attention",
                headers={"X-Api-Key": config.api_key or ""},
                json=attention_data,
            )

            if response.status_code == 200:
                result = response.json()

                if visualize and output_path and result.get("visualization"):
                    # Save visualization
                    viz_data = base64.b64decode(result["visualization"])
                    with open(output_path, 'wb') as f:
                        f.write(viz_data)
                    success(f"Attention visualization saved to {output_path}")
                else:
                    output(result, ctx.obj['output_format'])
            else:
                error(f"Failed to analyze attention: {response.status_code}")
                if response.text:
                    error(response.text)
                ctx.exit(1)
    except click.exceptions.Exit:
        # ctx.exit() works by raising Exit (a RuntimeError subclass). Let it
        # propagate so the handler below does not mislabel it a network error.
        raise
    except Exception as e:
        error(f"Network error: {e}")
        ctx.exit(1)
# Fetch and print the capabilities of an agent.
#
# Issues a GET to
# `<coordinator_url>/v1/multimodal/agents/<agent_id>/capabilities`.
#   * HTTP 200: the reply is printed.
#   * Any other status: status code (and body, if any) are reported, exit 1.
#   * Exception while performing the request/handling the reply: reported as a
#     network error, exit 1.
@multimodal.command()
@click.argument("agent_id")
@click.pass_context
def capabilities(ctx, agent_id: str):
    """List multi-modal agent capabilities"""
    config = ctx.obj['config']

    try:
        with httpx.Client() as client:
            response = client.get(
                f"{config.coordinator_url}/v1/multimodal/agents/{agent_id}/capabilities",
                headers={"X-Api-Key": config.api_key or ""},
            )

            if response.status_code == 200:
                # Named `reported` so the local does not shadow this command.
                reported = response.json()
                output(reported, ctx.obj['output_format'])
            else:
                error(f"Failed to get agent capabilities: {response.status_code}")
                # Show the server's explanation too, as the sibling commands do.
                if response.text:
                    error(response.text)
                ctx.exit(1)
    except click.exceptions.Exit:
        # ctx.exit() works by raising Exit (a RuntimeError subclass). Let it
        # propagate so the handler below does not mislabel it a network error.
        raise
    except Exception as e:
        error(f"Network error: {e}")
        ctx.exit(1)
# Exercise a single modality of an agent.
#
# POSTs the JSON document given via --test-data (or an empty object when the
# option is omitted) to
# `<coordinator_url>/v1/multimodal/agents/<agent_id>/test/<modality>`.
#   * HTTP 200: the result is printed.
#   * Any other status: status code (and body, if any) are reported, exit 1.
#   * Exception while performing the request/handling the reply: reported as a
#     network error, exit 1.
# An unreadable/invalid test data file is reported and the command returns
# early without contacting the coordinator.
@multimodal.command()
@click.argument("agent_id")
@click.option("--modality", required=True,
              type=click.Choice(["text", "image", "audio", "video"]),
              help="Modality to test")
@click.option("--test-data", type=click.File('r'), help="Test data JSON file")
@click.pass_context
def test(ctx, agent_id: str, modality: str, test_data):
    """Test individual modality processing"""
    config = ctx.obj['config']

    test_input = {}
    if test_data:
        try:
            test_input = json.load(test_data)
        except Exception as e:
            error(f"Failed to read test data file: {e}")
            return

    try:
        with httpx.Client() as client:
            response = client.post(
                f"{config.coordinator_url}/v1/multimodal/agents/{agent_id}/test/{modality}",
                headers={"X-Api-Key": config.api_key or ""},
                json=test_input,
            )

            if response.status_code == 200:
                result = response.json()
                success(f"Modality test completed for {modality}")
                output(result, ctx.obj['output_format'])
            else:
                error(f"Failed to test modality: {response.status_code}")
                if response.text:
                    error(response.text)
                ctx.exit(1)
    except click.exceptions.Exit:
        # ctx.exit() works by raising Exit (a RuntimeError subclass). Let it
        # propagate so the handler below does not mislabel it a network error.
        raise
    except Exception as e:
        error(f"Network error: {e}")
        ctx.exit(1)