feat: add marketplace metrics, privacy features, and service registry endpoints
- Add Prometheus metrics for marketplace API throughput and error rates with new dashboard panels
- Implement confidential transaction models with encryption support and access control
- Add key management system with registration, rotation, and audit logging
- Create services and registry routers for service discovery and management
- Integrate ZK proof generation for privacy-preserving receipts
- Add metrics instru
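For the first bullet above, here is a minimal sketch of what that kind of instrumentation typically looks like with prometheus_client. The metric names, label sets, and the record_request helper are illustrative assumptions, not the identifiers actually added by this commit.

# Illustrative sketch only: metric names, labels, and record_request() are assumptions,
# not the instrumentation introduced in this commit.
from prometheus_client import Counter, Histogram

MARKETPLACE_REQUESTS = Counter(
    "marketplace_api_requests_total",
    "Total marketplace API requests",
    ["endpoint", "method", "status"],
)
MARKETPLACE_LATENCY = Histogram(
    "marketplace_api_request_seconds",
    "Marketplace API request latency in seconds",
    ["endpoint"],
)


def record_request(endpoint: str, method: str, status: int, seconds: float) -> None:
    """Record one API call; error rate can be derived from the status label."""
    MARKETPLACE_REQUESTS.labels(endpoint=endpoint, method=method, status=str(status)).inc()
    MARKETPLACE_LATENCY.labels(endpoint=endpoint).observe(seconds)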
apps/miner-node/plugins/llm_inference.py (new file, 321 lines)
"""
|
||||
LLM inference plugin
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from typing import Dict, Any, List, Optional
|
||||
import time
|
||||
|
||||
from .base import GPUPlugin, PluginResult
|
||||
from .exceptions import PluginExecutionError
|
||||
|
||||
|
||||
class LLMPlugin(GPUPlugin):
    """Plugin for Large Language Model inference"""

    def __init__(self):
        super().__init__()
        self.service_id = "llm_inference"
        self.name = "LLM Inference"
        self.version = "1.0.0"
        self.description = "Run inference on large language models"
        self.capabilities = ["generate", "stream", "chat"]
        self._model_cache = {}

    def setup(self) -> None:
        """Initialize LLM dependencies"""
        super().setup()

        # Check for transformers installation
        try:
            from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
            self.transformers = AutoModelForCausalLM
            self.AutoTokenizer = AutoTokenizer
            self.pipeline = pipeline
        except ImportError:
            raise PluginExecutionError("Transformers not installed. Install with: pip install transformers accelerate")

        # Check for torch
        try:
            import torch
            self.torch = torch
        except ImportError:
            raise PluginExecutionError("PyTorch not installed. Install with: pip install torch")

    def validate_request(self, request: Dict[str, Any]) -> List[str]:
        """Validate LLM request parameters"""
        errors = []

        # Check required parameters
        if "prompt" not in request:
            errors.append("'prompt' is required")

        # Validate model
        model = request.get("model", "llama-7b")
        valid_models = [
            "llama-7b",
            "llama-13b",
            "mistral-7b",
            "mixtral-8x7b",
            "gpt-3.5-turbo",
            "gpt-4"
        ]
        if model not in valid_models:
            errors.append(f"Invalid model. Must be one of: {', '.join(valid_models)}")

        # Validate max_tokens
        max_tokens = request.get("max_tokens", 256)
        if not isinstance(max_tokens, int) or max_tokens < 1 or max_tokens > 4096:
            errors.append("max_tokens must be an integer between 1 and 4096")

        # Validate temperature
        temperature = request.get("temperature", 0.7)
        if not isinstance(temperature, (int, float)) or temperature < 0.0 or temperature > 2.0:
            errors.append("temperature must be between 0.0 and 2.0")

        # Validate top_p
        top_p = request.get("top_p")
        if top_p is not None and (not isinstance(top_p, (int, float)) or top_p <= 0.0 or top_p > 1.0):
            errors.append("top_p must be between 0.0 and 1.0")

        return errors

    def get_hardware_requirements(self) -> Dict[str, Any]:
        """Get hardware requirements for LLM inference"""
        return {
            "gpu": "recommended",
            "vram_gb": 8,
            "ram_gb": 16,
            "cuda": "recommended"
        }

    async def execute(self, request: Dict[str, Any]) -> PluginResult:
        """Execute LLM inference"""
        start_time = time.time()

        try:
            # Validate request
            errors = self.validate_request(request)
            if errors:
                return PluginResult(
                    success=False,
                    error=f"Validation failed: {'; '.join(errors)}"
                )

            # Get parameters
            prompt = request["prompt"]
            model_name = request.get("model", "llama-7b")
            max_tokens = request.get("max_tokens", 256)
            temperature = request.get("temperature", 0.7)
            top_p = request.get("top_p", 0.9)
            do_sample = request.get("do_sample", True)
            stream = request.get("stream", False)

            # Load model and tokenizer
            model, tokenizer = await self._load_model(model_name)

            # Generate response
            loop = asyncio.get_running_loop()

            if stream:
                # Streaming generation: consume the generator inside the executor
                # so token-by-token generation does not block the event loop.
                tokens = await loop.run_in_executor(
                    None,
                    lambda: list(self._generate_streaming(
                        model, tokenizer, prompt, max_tokens, temperature, top_p, do_sample
                    ))
                )
                full_response = "".join(tokens)

                execution_time = time.time() - start_time

                return PluginResult(
                    success=True,
                    data={
                        "text": full_response,
                        "tokens": tokens,
                        "streamed": True
                    },
                    metrics={
                        "model": model_name,
                        "prompt_tokens": len(tokenizer.encode(prompt)),
                        "generated_tokens": len(tokens),
                        "tokens_per_second": len(tokens) / execution_time if execution_time > 0 else 0
                    },
                    execution_time=execution_time
                )
            else:
                # Regular generation
                response = await loop.run_in_executor(
                    None,
                    lambda: self._generate(
                        model, tokenizer, prompt, max_tokens, temperature, top_p, do_sample
                    )
                )

                execution_time = time.time() - start_time

                # _generate returns only the newly generated text, so its token
                # count is the generated token count (no prompt subtraction needed).
                generated_tokens = len(tokenizer.encode(response))
                return PluginResult(
                    success=True,
                    data={
                        "text": response,
                        "streamed": False
                    },
                    metrics={
                        "model": model_name,
                        "prompt_tokens": len(tokenizer.encode(prompt)),
                        "generated_tokens": generated_tokens,
                        "tokens_per_second": generated_tokens / execution_time if execution_time > 0 else 0
                    },
                    execution_time=execution_time
                )

        except Exception as e:
            return PluginResult(
                success=False,
                error=str(e),
                execution_time=time.time() - start_time
            )

    async def _load_model(self, model_name: str):
        """Load LLM model and tokenizer with caching"""
        if model_name not in self._model_cache:
            loop = asyncio.get_running_loop()

            # Map model names to HuggingFace model IDs
            model_map = {
                "llama-7b": "meta-llama/Llama-2-7b-chat-hf",
                "llama-13b": "meta-llama/Llama-2-13b-chat-hf",
                "mistral-7b": "mistralai/Mistral-7B-Instruct-v0.1",
                "mixtral-8x7b": "mistralai/Mixtral-8x7B-Instruct-v0.1",
                "gpt-3.5-turbo": "openai-gpt",  # Would need OpenAI API
                "gpt-4": "openai-gpt-4"  # Would need OpenAI API
            }

            hf_model = model_map.get(model_name, model_name)

            # Load tokenizer
            tokenizer = await loop.run_in_executor(
                None,
                lambda: self.AutoTokenizer.from_pretrained(hf_model)
            )

            # Load model; self.vram_gb is assumed to be populated by the GPUPlugin base class
            device = "cuda" if self.torch.cuda.is_available() else "cpu"
            model = await loop.run_in_executor(
                None,
                lambda: self.transformers.from_pretrained(
                    hf_model,
                    torch_dtype=self.torch.float16 if device == "cuda" else self.torch.float32,
                    device_map="auto" if device == "cuda" else None,
                    load_in_4bit=device == "cuda" and self.vram_gb < 16
                )
            )

            self._model_cache[model_name] = (model, tokenizer)

        return self._model_cache[model_name]

    def _generate(
        self,
        model,
        tokenizer,
        prompt: str,
        max_tokens: int,
        temperature: float,
        top_p: float,
        do_sample: bool
    ) -> str:
        """Generate text without streaming"""
        inputs = tokenizer(prompt, return_tensors="pt")

        if self.torch.cuda.is_available():
            inputs = {k: v.cuda() for k, v in inputs.items()}

        with self.torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                do_sample=do_sample,
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode only the new tokens
        new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True)

        return response

    def _generate_streaming(
        self,
        model,
        tokenizer,
        prompt: str,
        max_tokens: int,
        temperature: float,
        top_p: float,
        do_sample: bool
    ):
        """Generate text with streaming"""
        inputs = tokenizer(prompt, return_tensors="pt")

        if self.torch.cuda.is_available():
            inputs = {k: v.cuda() for k, v in inputs.items()}

        # Simple token-by-token streaming implementation.
        # In production, you'd use model.generate with a streamer instead of
        # re-running generate for every token.
        with self.torch.no_grad():
            for _ in range(max_tokens):
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=1,
                    temperature=temperature,
                    top_p=top_p,
                    do_sample=do_sample,
                    pad_token_id=tokenizer.eos_token_id
                )

                # Keep the new token as a (1, 1) tensor so it can be concatenated below
                new_token = outputs[:, -1:]
                text = tokenizer.decode(new_token[0], skip_special_tokens=True)

                # Compare token ids: the decoded EOS text is stripped by skip_special_tokens
                if new_token[0, 0].item() == tokenizer.eos_token_id:
                    break

                yield text

                # Update inputs for next iteration
                inputs["input_ids"] = self.torch.cat([inputs["input_ids"], new_token], dim=1)
                if "attention_mask" in inputs:
                    inputs["attention_mask"] = self.torch.cat([
                        inputs["attention_mask"],
                        self.torch.ones(
                            (1, 1),
                            dtype=inputs["attention_mask"].dtype,
                            device=inputs["attention_mask"].device
                        )
                    ], dim=1)

    async def health_check(self) -> bool:
        """Check LLM health"""
        try:
            # Try to load one of the supported models to verify the inference stack
            await self._load_model("mistral-7b")
            return True
        except Exception:
            return False

    def cleanup(self) -> None:
        """Cleanup resources"""
        # Move models to CPU and clear cache
        for model, _ in self._model_cache.values():
            if hasattr(model, 'to'):
                model.to("cpu")
        self._model_cache.clear()

        # Clear GPU cache
        if self.torch.cuda.is_available():
            self.torch.cuda.empty_cache()
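For context, a minimal usage sketch of the plugin follows, assuming it is driven directly rather than through the miner-node plugin manager (whose wiring is not part of this diff). The request fields mirror validate_request above; the import path is an assumption based on the file location, and PluginResult is assumed to expose its constructor arguments (success, data, error, metrics) as attributes.

# Hypothetical driver: how miner-node actually discovers and schedules plugins is
# not shown in this diff, so this only exercises the methods defined above.
import asyncio

from plugins.llm_inference import LLMPlugin  # import path is an assumption


async def main() -> None:
    plugin = LLMPlugin()
    plugin.setup()  # raises PluginExecutionError if transformers/torch are missing

    result = await plugin.execute({
        "prompt": "Explain GPU sharing in one sentence.",
        "model": "mistral-7b",
        "max_tokens": 64,
        "temperature": 0.7,
    })

    if result.success:
        print(result.data["text"])
        print(result.metrics)
    else:
        print("Inference failed:", result.error)

    plugin.cleanup()


if __name__ == "__main__":
    asyncio.run(main())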