aitbc/plugins/ollama/service.py
Latest commit 9b9c5beb23 by oib, 2026-01-24 14:44:51 +01:00:

chore: enhance .gitignore and remove obsolete documentation files

- Reorganize .gitignore with categorized sections for better maintainability
- Add comprehensive ignore patterns for Python, Node.js, databases, logs, and build artifacts
- Add project-specific ignore rules for coordinator, explorer, and deployment files
- Remove outdated documentation: BITCOIN-WALLET-SETUP.md, LOCAL_ASSETS_SUMMARY.md, README-CONTAINER-DEPLOYMENT.md, README-DOMAIN-DEPLOYMENT.md


#!/usr/bin/env python3
"""
AITBC Ollama Plugin Service - Provides GPU-powered LLM inference via Ollama
"""
import asyncio
import logging
from datetime import datetime
from typing import Dict, Any, Optional

import httpx

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class OllamaPlugin:
    """Ollama plugin for AITBC - provides LLM inference services"""

    def __init__(self, ollama_url: str = "http://localhost:11434"):
        self.ollama_url = ollama_url
        self.client = httpx.AsyncClient(timeout=60.0)
        # Reserved for model-list caching; not yet used by any method below.
        self.models_cache = None
        self.last_cache_update = None

    async def get_models(self) -> list:
        """Get available models from Ollama"""
        try:
            response = await self.client.get(f"{self.ollama_url}/api/tags")
            if response.status_code == 200:
                data = response.json()
                return data.get("models", [])
            return []
        except Exception as e:
            logger.error(f"Failed to get models: {e}")
            return []
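
    # For reference, get_models() above relies on /api/tags returning JSON
    # shaped roughly like (values illustrative):
    #   {"models": [{"name": "llama3.2:latest", "size": 2019393189, ...}]}
    # Only "name" and "size" are consumed elsewhere in this file.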
    async def generate(
        self,
        model: str,
        prompt: str,
        system_prompt: Optional[str] = None,
        temperature: float = 0.7,
        max_tokens: Optional[int] = None
    ) -> Dict[str, Any]:
        """Generate text using Ollama model"""
        request_data = {
            "model": model,
            "prompt": prompt,
            "stream": False,
            "options": {
                "temperature": temperature
            }
        }
        if system_prompt:
            request_data["system"] = system_prompt
        if max_tokens:
            request_data["options"]["num_predict"] = max_tokens
        try:
            logger.info(f"Generating with model: {model}")
            start_time = datetime.now()
            response = await self.client.post(
                f"{self.ollama_url}/api/generate",
                json=request_data
            )
            if response.status_code == 200:
                result = response.json()
                end_time = datetime.now()
                duration = (end_time - start_time).total_seconds()
                return {
                    "success": True,
                    "text": result.get("response", ""),
                    "model": model,
                    "prompt_tokens": result.get("prompt_eval_count", 0),
                    "completion_tokens": result.get("eval_count", 0),
                    "total_tokens": result.get("prompt_eval_count", 0) + result.get("eval_count", 0),
                    "duration_seconds": duration,
                    "done": result.get("done", False)
                }
            else:
                return {
                    "success": False,
                    "error": f"Ollama error: {response.status_code}",
                    "details": response.text
                }
        except Exception as e:
            logger.error(f"Generation failed: {e}")
            return {
                "success": False,
                "error": str(e)
            }
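
    # Usage sketch for generate() above (the model tag is an assumption
    # about what has been pulled locally; any installed model works):
    #
    #   result = await ollama_service.generate(
    #       model="llama3.2:latest",
    #       prompt="Explain the AITBC plugin interface in one sentence.",
    #       temperature=0.2,
    #       max_tokens=128,
    #   )
    #   if result["success"]:
    #       print(result["text"], result["total_tokens"])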
    async def chat(
        self,
        model: str,
        messages: list,
        temperature: float = 0.7,
        max_tokens: Optional[int] = None
    ) -> Dict[str, Any]:
        """Chat with Ollama model"""
        request_data = {
            "model": model,
            "messages": messages,
            "stream": False,
            "options": {
                "temperature": temperature
            }
        }
        if max_tokens:
            request_data["options"]["num_predict"] = max_tokens
        try:
            logger.info(f"Chat with model: {model}")
            start_time = datetime.now()
            response = await self.client.post(
                f"{self.ollama_url}/api/chat",
                json=request_data
            )
            if response.status_code == 200:
                result = response.json()
                end_time = datetime.now()
                duration = (end_time - start_time).total_seconds()
                return {
                    "success": True,
                    "message": result.get("message", {}),
                    "model": model,
                    "prompt_tokens": result.get("prompt_eval_count", 0),
                    "completion_tokens": result.get("eval_count", 0),
                    "total_tokens": result.get("prompt_eval_count", 0) + result.get("eval_count", 0),
                    "duration_seconds": duration,
                    "done": result.get("done", False)
                }
            else:
                return {
                    "success": False,
                    "error": f"Ollama error: {response.status_code}",
                    "details": response.text
                }
        except Exception as e:
            logger.error(f"Chat failed: {e}")
            return {
                "success": False,
                "error": str(e)
            }
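
    # chat() above expects the standard Ollama chat message format, e.g.:
    #
    #   messages = [
    #       {"role": "system", "content": "You are a concise assistant."},
    #       {"role": "user", "content": "What is AITBC?"},
    #   ]
    #   result = await ollama_service.chat(model="llama3.2:latest",
    #                                      messages=messages)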
    async def get_model_info(self, model: str) -> Dict[str, Any]:
        """Get detailed information about a model"""
        try:
            response = await self.client.post(
                f"{self.ollama_url}/api/show",
                json={"name": model}
            )
            if response.status_code == 200:
                return response.json()
            return {}
        except Exception as e:
            logger.error(f"Failed to get model info: {e}")
            return {}
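
    # get_model_info() passes the /api/show payload through unmodified;
    # current Ollama releases include fields such as "modelfile",
    # "parameters", "template", and "details" (exact set varies by version).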
    def calculate_cost(self, model: str, tokens: int) -> float:
        """Calculate cost for inference based on model and tokens"""
        # Pricing per 1M tokens (adjust based on your pricing model)
        pricing = {
            "deepseek-r1:14b": 0.14,
            "qwen2.5-coder:14b": 0.12,
            "deepseek-coder-v2:latest": 0.12,
            "gemma3:12b": 0.10,
            "deepcoder:latest": 0.08,
            "deepseek-coder:6.7b-base": 0.06,
            "llama3.2:3b-instruct-q8_0": 0.04,
            "mistral:latest": 0.04,
            "llama3.2:latest": 0.02,
            "gemma3:4b": 0.02,
            "qwen2.5:1.5b": 0.01,
            "gemma3:1b": 0.01,
            "lauchacarro/qwen2.5-translator:latest": 0.01
        }
        price_per_million = pricing.get(model, 0.05)  # Default price
        cost = (tokens / 1_000_000) * price_per_million
        return round(cost, 6)
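
    # Worked example: a call that consumed 1,500 prompt + 500 completion
    # tokens on "llama3.2:latest" costs (2_000 / 1_000_000) * 0.02 = 0.00004.
    # Any model missing from the table falls back to the 0.05 default rate.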
# Service instance
ollama_service = OllamaPlugin()


# AITBC Plugin Interface
async def handle_request(request: Dict[str, Any]) -> Dict[str, Any]:
    """Handle AITBC plugin requests"""
    action = request.get("action")
    if action == "list_models":
        models = await ollama_service.get_models()
        return {
            "success": True,
            # .get() guards against entries that omit "name" or "size"
            "models": [{"name": m.get("name"), "size": m.get("size")} for m in models]
        }
    elif action == "generate":
        result = await ollama_service.generate(
            model=request.get("model"),
            prompt=request.get("prompt"),
            system_prompt=request.get("system_prompt"),
            temperature=request.get("temperature", 0.7),
            max_tokens=request.get("max_tokens")
        )
        if result["success"]:
            # Add cost calculation
            result["cost"] = ollama_service.calculate_cost(
                result["model"],
                result["total_tokens"]
            )
        return result
    elif action == "chat":
        result = await ollama_service.chat(
            model=request.get("model"),
            messages=request.get("messages"),
            temperature=request.get("temperature", 0.7),
            max_tokens=request.get("max_tokens")
        )
        if result["success"]:
            # Add cost calculation
            result["cost"] = ollama_service.calculate_cost(
                result["model"],
                result["total_tokens"]
            )
        return result
    elif action == "model_info":
        model = request.get("model")
        info = await ollama_service.get_model_info(model)
        return {
            "success": True,
            "info": info
        }
    else:
        return {
            "success": False,
            "error": f"Unknown action: {action}"
        }
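
# Example requests accepted by handle_request above (model tags are
# illustrative; field names follow the function body):
#
#   {"action": "list_models"}
#   {"action": "generate", "model": "mistral:latest", "prompt": "Hello"}
#   {"action": "chat", "model": "mistral:latest",
#    "messages": [{"role": "user", "content": "Hello"}]}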
if __name__ == "__main__":
    # Test the service
    async def test():
        # List models
        models = await ollama_service.get_models()
        print(f"Available models: {len(models)}")
        # Test generation
        if models:
            result = await ollama_service.generate(
                model=models[0]["name"],
                prompt="What is AITBC?",
                max_tokens=100
            )
            print(f"Generation result: {result}")

    asyncio.run(test())
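
# Note: the module-level AsyncClient is never closed. A long-running host
# could add a shutdown hook along these lines (hypothetical; not part of
# the current plugin interface):
#
#   async def shutdown():
#       await ollama_service.client.aclose()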