#!/usr/bin/env python3
# Script Version: 0.2
# Description: Import existing embeddings.json into ChromaDB using the new client API

import os
|
|
import json
|
|
from chromadb import PersistentClient
|
|
|
|
# Directory passed to PersistentClient as the on-disk location of the database.
CHROMA_DIR = "chromadb"
# Name of the collection that receives the imported documents.
COLLECTION_NAME = "cds_docs"
# JSON file iterated below as {filename: embedding vector} pairs.
EMBEDDING_FILE = "embeddings.json"
# Directory holding the source documents named by the keys of EMBEDDING_FILE.
CONTENT_DIR = "content"

# New Chroma client (post-migration)
client = PersistentClient(path=CHROMA_DIR)
collection = client.get_or_create_collection(name=COLLECTION_NAME)

# Load existing embeddings. JSON is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's default locale encoding.
with open(EMBEDDING_FILE, "r", encoding="utf-8") as f:
    embeddings_data = json.load(f)

# Ingest each document, keeping track of what was actually stored so the
# final report reflects reality rather than always claiming success.
imported_count = 0
skipped_count = 0
for filename, vector in embeddings_data.items():
    filepath = os.path.join(CONTENT_DIR, filename)

    # Guard only the file read: a FileNotFoundError raised by the database
    # layer must not be misreported as a missing content file.
    try:
        with open(filepath, "r", encoding="utf-8") as content_file:
            text = content_file.read().strip()
    except FileNotFoundError:
        print(f"[WARN] Skipping missing file: {filepath}")
        skipped_count += 1
        continue

    # The filename doubles as the record ID and is also stored as metadata.
    # NOTE(review): how add() treats an ID that already exists depends on the
    # installed chromadb version -- confirm, and consider upsert() if re-runs
    # are expected to refresh existing records.
    collection.add(
        documents=[text],
        metadatas=[{"filename": filename}],
        ids=[filename],
        embeddings=[vector],
    )
    imported_count += 1

if imported_count:
    print("✅ Embeddings successfully imported into Chroma")
else:
    print("[WARN] No embeddings were imported into Chroma")
print(f"[INFO] Imported: {imported_count}, skipped (missing file): {skipped_count}")
|
|
|