chore: enhance .gitignore and remove obsolete documentation files

- Reorganize .gitignore with categorized sections for better maintainability
- Add comprehensive ignore patterns for Python, Node.js, databases, logs, and build artifacts
- Add project-specific ignore rules for coordinator, explorer, and deployment files
- Remove outdated documentation: BITCOIN-WALLET-SETUP.md, LOCAL_ASSETS_SUMMARY.md, README-CONTAINER-DEPLOYMENT.md, README-DOMAIN-DEPLOYMENT.md

Python · Executable File · 280 lines · 8.8 KiB

```python
#!/usr/bin/env python3
"""
AITBC Ollama Plugin Service - Provides GPU-powered LLM inference via Ollama
"""

import asyncio
import httpx
import logging
from datetime import datetime
from typing import Dict, Any, Optional
import json

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class OllamaPlugin:
    """Ollama plugin for AITBC - provides LLM inference services"""

    def __init__(self, ollama_url: str = "http://localhost:11434"):
        self.ollama_url = ollama_url
        self.client = httpx.AsyncClient(timeout=60.0)
        self.models_cache = None
        self.last_cache_update = None

    async def get_models(self) -> list:
        """Get available models from Ollama"""
        try:
            response = await self.client.get(f"{self.ollama_url}/api/tags")
            if response.status_code == 200:
                data = response.json()
                return data.get("models", [])
            return []
        except Exception as e:
            logger.error(f"Failed to get models: {e}")
            return []

    async def generate(
        self,
        model: str,
        prompt: str,
        system_prompt: Optional[str] = None,
        temperature: float = 0.7,
        max_tokens: Optional[int] = None
    ) -> Dict[str, Any]:
        """Generate text using Ollama model"""

        request_data = {
            "model": model,
            "prompt": prompt,
            "stream": False,
            "options": {
                "temperature": temperature
            }
        }

        if system_prompt:
            request_data["system"] = system_prompt

        if max_tokens:
            request_data["options"]["num_predict"] = max_tokens

        try:
            logger.info(f"Generating with model: {model}")
            start_time = datetime.now()

            response = await self.client.post(
                f"{self.ollama_url}/api/generate",
                json=request_data
            )

            if response.status_code == 200:
                result = response.json()
                end_time = datetime.now()
                duration = (end_time - start_time).total_seconds()

                return {
                    "success": True,
                    "text": result.get("response", ""),
                    "model": model,
                    "prompt_tokens": result.get("prompt_eval_count", 0),
                    "completion_tokens": result.get("eval_count", 0),
                    "total_tokens": result.get("prompt_eval_count", 0) + result.get("eval_count", 0),
                    "duration_seconds": duration,
                    "done": result.get("done", False)
                }
            else:
                return {
                    "success": False,
                    "error": f"Ollama error: {response.status_code}",
                    "details": response.text
                }

        except Exception as e:
            logger.error(f"Generation failed: {e}")
            return {
                "success": False,
                "error": str(e)
            }

    async def chat(
        self,
        model: str,
        messages: list,
        temperature: float = 0.7,
        max_tokens: Optional[int] = None
    ) -> Dict[str, Any]:
        """Chat with Ollama model"""

        request_data = {
            "model": model,
            "messages": messages,
            "stream": False,
            "options": {
                "temperature": temperature
            }
        }

        if max_tokens:
            request_data["options"]["num_predict"] = max_tokens

        try:
            logger.info(f"Chat with model: {model}")
            start_time = datetime.now()

            response = await self.client.post(
                f"{self.ollama_url}/api/chat",
                json=request_data
            )

            if response.status_code == 200:
                result = response.json()
                end_time = datetime.now()
                duration = (end_time - start_time).total_seconds()

                return {
                    "success": True,
                    "message": result.get("message", {}),
                    "model": model,
                    "prompt_tokens": result.get("prompt_eval_count", 0),
                    "completion_tokens": result.get("eval_count", 0),
                    "total_tokens": result.get("prompt_eval_count", 0) + result.get("eval_count", 0),
                    "duration_seconds": duration,
                    "done": result.get("done", False)
                }
            else:
                return {
                    "success": False,
                    "error": f"Ollama error: {response.status_code}",
                    "details": response.text
                }

        except Exception as e:
            logger.error(f"Chat failed: {e}")
            return {
                "success": False,
                "error": str(e)
            }

    async def get_model_info(self, model: str) -> Dict[str, Any]:
        """Get detailed information about a model"""
        try:
            response = await self.client.post(
                f"{self.ollama_url}/api/show",
                json={"name": model}
            )

            if response.status_code == 200:
                return response.json()
            return {}
        except Exception as e:
            logger.error(f"Failed to get model info: {e}")
            return {}

    def calculate_cost(self, model: str, tokens: int) -> float:
        """Calculate cost for inference based on model and tokens"""
        # Pricing per 1M tokens (adjust based on your pricing model)
        pricing = {
            "deepseek-r1:14b": 0.14,
            "qwen2.5-coder:14b": 0.12,
            "deepseek-coder-v2:latest": 0.12,
            "gemma3:12b": 0.10,
            "deepcoder:latest": 0.08,
            "deepseek-coder:6.7b-base": 0.06,
            "llama3.2:3b-instruct-q8_0": 0.04,
            "mistral:latest": 0.04,
            "llama3.2:latest": 0.02,
            "gemma3:4b": 0.02,
            "qwen2.5:1.5b": 0.01,
            "gemma3:1b": 0.01,
            "lauchacarro/qwen2.5-translator:latest": 0.01
        }

        price_per_million = pricing.get(model, 0.05)  # Default price
        cost = (tokens / 1_000_000) * price_per_million
        return round(cost, 6)
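    # Worked example (hypothetical figures, not from the source): a response that
    # consumed 1,500 total tokens on "gemma3:12b" would be priced at
    # (1500 / 1_000_000) * 0.10 = 0.00015, rounded to 6 decimal places.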

# Service instance
ollama_service = OllamaPlugin()


# AITBC Plugin Interface
async def handle_request(request: Dict[str, Any]) -> Dict[str, Any]:
    """Handle AITBC plugin requests"""

    action = request.get("action")

    if action == "list_models":
        models = await ollama_service.get_models()
        return {
            "success": True,
            "models": [{"name": m["name"], "size": m["size"]} for m in models]
        }

    elif action == "generate":
        result = await ollama_service.generate(
            model=request.get("model"),
            prompt=request.get("prompt"),
            system_prompt=request.get("system_prompt"),
            temperature=request.get("temperature", 0.7),
            max_tokens=request.get("max_tokens")
        )

        if result["success"]:
            # Add cost calculation
            result["cost"] = ollama_service.calculate_cost(
                result["model"],
                result["total_tokens"]
            )

        return result

    elif action == "chat":
        result = await ollama_service.chat(
            model=request.get("model"),
            messages=request.get("messages"),
            temperature=request.get("temperature", 0.7),
            max_tokens=request.get("max_tokens")
        )

        if result["success"]:
            # Add cost calculation
            result["cost"] = ollama_service.calculate_cost(
                result["model"],
                result["total_tokens"]
            )

        return result

    elif action == "model_info":
        model = request.get("model")
        info = await ollama_service.get_model_info(model)
        return {
            "success": True,
            "info": info
        }

    else:
        return {
            "success": False,
            "error": f"Unknown action: {action}"
        }


if __name__ == "__main__":
    # Test the service
    async def test():
        # List models
        models = await ollama_service.get_models()
        print(f"Available models: {len(models)}")

        # Test generation
        if models:
            result = await ollama_service.generate(
                model=models[0]["name"],
                prompt="What is AITBC?",
                max_tokens=100
            )
            print(f"Generation result: {result}")

    asyncio.run(test())
```
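
For reference, a minimal sketch of how the plugin interface above could be exercised end to end, assuming a local Ollama instance is reachable at the default `http://localhost:11434` with at least one model pulled; the module name `ollama_plugin` and the prompt text are hypothetical, while the request shapes mirror the `handle_request` dispatcher:

```python
import asyncio

from ollama_plugin import handle_request  # hypothetical module name for the file above


async def main():
    # Discover which models the node exposes
    listing = await handle_request({"action": "list_models"})
    print(listing)

    if listing.get("success") and listing["models"]:
        model = listing["models"][0]["name"]

        # Single-turn generation; "cost" is attached by the dispatcher on success
        gen = await handle_request({
            "action": "generate",
            "model": model,
            "prompt": "Summarize what an inference plugin does in one sentence.",
            "temperature": 0.2,
            "max_tokens": 64,
        })
        print(gen.get("text"), gen.get("cost"))

        # Multi-turn chat uses the same message shape as Ollama's /api/chat
        chat = await handle_request({
            "action": "chat",
            "model": model,
            "messages": [{"role": "user", "content": "Hello!"}],
        })
        print(chat.get("message"))


if __name__ == "__main__":
    asyncio.run(main())
```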