feat: add marketplace metrics, privacy features, and service registry endpoints

- Add Prometheus metrics for marketplace API throughput and error rates with new dashboard panels
- Implement confidential transaction models with encryption support and access control
- Add key management system with registration, rotation, and audit logging
- Create services and registry routers for service discovery and management
- Integrate ZK proof generation for privacy-preserving receipts
- Add metrics instrumentation (a minimal, illustrative sketch follows below)
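
A minimal sketch of the kind of counter/histogram instrumentation described above, assuming prometheus_client; the metric and label names here are illustrative, not necessarily the ones added by this commit:

    from prometheus_client import Counter, Histogram

    MARKETPLACE_REQUESTS = Counter(
        "marketplace_api_requests_total",
        "Marketplace API requests",
        ["endpoint", "status"],
    )
    MARKETPLACE_LATENCY = Histogram(
        "marketplace_api_request_seconds",
        "Marketplace API request latency",
        ["endpoint"],
    )

    # In a request handler:
    with MARKETPLACE_LATENCY.labels(endpoint="/listings").time():
        ...  # handle the request
    MARKETPLACE_REQUESTS.labels(endpoint="/listings", status="200").inc()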
oib
2025-12-22 10:33:23 +01:00
parent d98b2c7772
commit c8be9d7414
260 changed files with 59033 additions and 351 deletions


@@ -0,0 +1,321 @@
"""
LLM inference plugin
"""
import asyncio
from typing import Dict, Any, List, Optional
import time
from .base import GPUPlugin, PluginResult
from .exceptions import PluginExecutionError
class LLMPlugin(GPUPlugin):
"""Plugin for Large Language Model inference"""
def __init__(self):
super().__init__()
self.service_id = "llm_inference"
self.name = "LLM Inference"
self.version = "1.0.0"
self.description = "Run inference on large language models"
self.capabilities = ["generate", "stream", "chat"]
self._model_cache = {}
def setup(self) -> None:
"""Initialize LLM dependencies"""
super().setup()
# Check for transformers installation
try:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
self.transformers = AutoModelForCausalLM
self.AutoTokenizer = AutoTokenizer
self.pipeline = pipeline
except ImportError:
raise PluginExecutionError("Transformers not installed. Install with: pip install transformers accelerate")
# Check for torch
try:
import torch
self.torch = torch
except ImportError:
raise PluginExecutionError("PyTorch not installed. Install with: pip install torch")
    def validate_request(self, request: Dict[str, Any]) -> List[str]:
        """Validate LLM request parameters"""
        errors = []

        # Check required parameters
        if "prompt" not in request:
            errors.append("'prompt' is required")

        # Validate model
        model = request.get("model", "llama-7b")
        valid_models = [
            "llama-7b",
            "llama-13b",
            "mistral-7b",
            "mixtral-8x7b",
            "gpt-3.5-turbo",
            "gpt-4",
        ]
        if model not in valid_models:
            errors.append(f"Invalid model. Must be one of: {', '.join(valid_models)}")

        # Validate max_tokens
        max_tokens = request.get("max_tokens", 256)
        if not isinstance(max_tokens, int) or max_tokens < 1 or max_tokens > 4096:
            errors.append("max_tokens must be an integer between 1 and 4096")

        # Validate temperature
        temperature = request.get("temperature", 0.7)
        if not isinstance(temperature, (int, float)) or temperature < 0.0 or temperature > 2.0:
            errors.append("temperature must be between 0.0 and 2.0")

        # Validate top_p
        top_p = request.get("top_p")
        if top_p is not None and (not isinstance(top_p, (int, float)) or top_p <= 0.0 or top_p > 1.0):
            errors.append("top_p must be between 0.0 and 1.0")

        return errors

    def get_hardware_requirements(self) -> Dict[str, Any]:
        """Get hardware requirements for LLM inference"""
        return {
            "gpu": "recommended",
            "vram_gb": 8,
            "ram_gb": 16,
            "cuda": "recommended"
        }
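
    # Example request that passes validate_request() above (a hedged illustration;
    # the prompt text and parameter values are arbitrary, the keys and ranges come
    # from the validation rules):
    #
    #   request = {
    #       "prompt": "Explain zero-knowledge proofs in one paragraph.",
    #       "model": "mistral-7b",
    #       "max_tokens": 256,
    #       "temperature": 0.7,
    #       "top_p": 0.9,
    #       "stream": False,
    #   }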
    async def execute(self, request: Dict[str, Any]) -> PluginResult:
        """Execute LLM inference"""
        start_time = time.time()
        try:
            # Validate request
            errors = self.validate_request(request)
            if errors:
                return PluginResult(
                    success=False,
                    error=f"Validation failed: {'; '.join(errors)}"
                )

            # Get parameters
            prompt = request["prompt"]
            model_name = request.get("model", "llama-7b")
            max_tokens = request.get("max_tokens", 256)
            temperature = request.get("temperature", 0.7)
            top_p = request.get("top_p", 0.9)
            do_sample = request.get("do_sample", True)
            stream = request.get("stream", False)

            # Load model and tokenizer
            model, tokenizer = await self._load_model(model_name)

            # Generate the response off the event loop
            loop = asyncio.get_running_loop()
            prompt_tokens = len(tokenizer.encode(prompt))

            if stream:
                # Streaming generation: drain the generator inside the executor so
                # the blocking model.generate() calls never run on the event loop thread.
                tokens = await loop.run_in_executor(
                    None,
                    lambda: list(self._generate_streaming(
                        model, tokenizer, prompt, max_tokens, temperature, top_p, do_sample
                    ))
                )
                full_response = "".join(tokens)
                execution_time = time.time() - start_time
                return PluginResult(
                    success=True,
                    data={
                        "text": full_response,
                        "tokens": tokens,
                        "streamed": True
                    },
                    metrics={
                        "model": model_name,
                        "prompt_tokens": prompt_tokens,
                        "generated_tokens": len(tokens),
                        "tokens_per_second": len(tokens) / execution_time if execution_time > 0 else 0
                    },
                    execution_time=execution_time
                )
            else:
                # Regular generation
                response = await loop.run_in_executor(
                    None,
                    lambda: self._generate(
                        model, tokenizer, prompt, max_tokens, temperature, top_p, do_sample
                    )
                )
                execution_time = time.time() - start_time
                # _generate() returns only the newly generated text, so re-encoding it
                # approximates the number of generated tokens.
                generated_tokens = len(tokenizer.encode(response))
                return PluginResult(
                    success=True,
                    data={
                        "text": response,
                        "streamed": False
                    },
                    metrics={
                        "model": model_name,
                        "prompt_tokens": prompt_tokens,
                        "generated_tokens": generated_tokens,
                        "tokens_per_second": generated_tokens / execution_time if execution_time > 0 else 0
                    },
                    execution_time=execution_time
                )
        except Exception as e:
            return PluginResult(
                success=False,
                error=str(e),
                execution_time=time.time() - start_time
            )
    async def _load_model(self, model_name: str):
        """Load LLM model and tokenizer with caching"""
        if model_name not in self._model_cache:
            loop = asyncio.get_running_loop()

            # Map model names to HuggingFace model IDs
            model_map = {
                "llama-7b": "meta-llama/Llama-2-7b-chat-hf",
                "llama-13b": "meta-llama/Llama-2-13b-chat-hf",
                "mistral-7b": "mistralai/Mistral-7B-Instruct-v0.1",
                "mixtral-8x7b": "mistralai/Mixtral-8x7B-Instruct-v0.1",
                "gpt-3.5-turbo": "openai-gpt",  # Would need the OpenAI API, not local inference
                "gpt-4": "openai-gpt-4"  # Would need the OpenAI API, not local inference
            }
            hf_model = model_map.get(model_name, model_name)

            # Load tokenizer
            tokenizer = await loop.run_in_executor(
                None,
                lambda: self.AutoTokenizer.from_pretrained(hf_model)
            )

            # Load model; fall back to 4-bit loading on GPUs with less than 16 GB of VRAM.
            # Prefer a vram_gb value supplied by the GPUPlugin base class, otherwise ask torch.
            device = "cuda" if self.torch.cuda.is_available() else "cpu"
            vram_gb = getattr(self, "vram_gb", None)
            if vram_gb is None and device == "cuda":
                vram_gb = self.torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
            model = await loop.run_in_executor(
                None,
                lambda: self.transformers.from_pretrained(
                    hf_model,
                    torch_dtype=self.torch.float16 if device == "cuda" else self.torch.float32,
                    device_map="auto" if device == "cuda" else None,
                    load_in_4bit=(device == "cuda" and vram_gb is not None and vram_gb < 16)
                )
            )
            self._model_cache[model_name] = (model, tokenizer)
        return self._model_cache[model_name]
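
    # Note: recent transformers versions deprecate passing load_in_4bit directly to
    # from_pretrained() in favour of an explicit quantization config. A sketch of the
    # equivalent call, assuming bitsandbytes is installed (not verified against the
    # version this repo pins):
    #
    #   from transformers import BitsAndBytesConfig
    #   quant_config = BitsAndBytesConfig(load_in_4bit=True,
    #                                     bnb_4bit_compute_dtype=torch.float16)
    #   model = AutoModelForCausalLM.from_pretrained(hf_model,
    #                                                device_map="auto",
    #                                                quantization_config=quant_config)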
    def _generate(
        self,
        model,
        tokenizer,
        prompt: str,
        max_tokens: int,
        temperature: float,
        top_p: float,
        do_sample: bool
    ) -> str:
        """Generate text without streaming"""
        inputs = tokenizer(prompt, return_tensors="pt")
        if self.torch.cuda.is_available():
            inputs = {k: v.cuda() for k, v in inputs.items()}

        with self.torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                do_sample=do_sample,
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode only the new tokens
        new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True)
        return response
    def _generate_streaming(
        self,
        model,
        tokenizer,
        prompt: str,
        max_tokens: int,
        temperature: float,
        top_p: float,
        do_sample: bool
    ):
        """Generate text with streaming"""
        inputs = tokenizer(prompt, return_tensors="pt")
        if self.torch.cuda.is_available():
            inputs = {k: v.cuda() for k, v in inputs.items()}

        # Simple token-by-token streaming implementation.
        # In production, you'd use model.generate with a streamer (see the note below).
        with self.torch.no_grad():
            for _ in range(max_tokens):
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=1,
                    temperature=temperature,
                    top_p=top_p,
                    do_sample=do_sample,
                    pad_token_id=tokenizer.eos_token_id
                )
                # Keep the new token as a (1, 1) tensor so it can be concatenated below
                new_token = outputs[:, -1:]
                if new_token.item() == tokenizer.eos_token_id:
                    break
                yield tokenizer.decode(new_token[0], skip_special_tokens=True)

                # Update inputs for the next iteration
                inputs["input_ids"] = self.torch.cat([inputs["input_ids"], new_token], dim=1)
                if "attention_mask" in inputs:
                    inputs["attention_mask"] = self.torch.cat([
                        inputs["attention_mask"],
                        self.torch.ones((1, 1), device=inputs["attention_mask"].device)
                    ], dim=1)
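
    # Note: as the comment above says, a production implementation would stream via
    # transformers' TextIteratorStreamer instead of calling generate() one token at a
    # time. A sketch, assuming the same model/tokenizer/inputs as above:
    #
    #   from threading import Thread
    #   from transformers import TextIteratorStreamer
    #
    #   streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    #   Thread(target=model.generate,
    #          kwargs=dict(**inputs, max_new_tokens=max_tokens, streamer=streamer)).start()
    #   for text in streamer:
    #       yield text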
    async def health_check(self) -> bool:
        """Check LLM health"""
        try:
            # Try to load the smallest supported model (still a 7B download,
            # so this check is expensive on a cold cache)
            await self._load_model("mistral-7b")
            return True
        except Exception:
            return False

    def cleanup(self) -> None:
        """Cleanup resources"""
        # Move models to CPU and clear cache
        for model, _ in self._model_cache.values():
            if hasattr(model, 'to'):
                model.to("cpu")
        self._model_cache.clear()

        # Clear GPU cache (torch may be missing if setup() never completed)
        if hasattr(self, "torch") and self.torch.cuda.is_available():
            self.torch.cuda.empty_cache()
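

# A minimal usage sketch (illustrative only; the plugin host that normally drives
# setup()/execute() is not part of this file, and running this requires a GPU plus
# the transformers/torch dependencies checked in setup()):
#
#   import asyncio
#
#   async def main():
#       plugin = LLMPlugin()
#       plugin.setup()
#       result = await plugin.execute({
#           "prompt": "Write a haiku about GPUs.",
#           "model": "mistral-7b",
#           "max_tokens": 64,
#       })
#       print(result.success, result.data["text"] if result.success else result.error)
#
#   asyncio.run(main())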