at1-workstation-scripts/import_embeddings.py

#!/usr/bin/env python3
# Script Version: 0.2
# Description: Import existing embeddings.json into ChromaDB using the new client API

import os
import json
from chromadb import PersistentClient

# All paths below are relative, so they resolve against the current working
# directory the script is launched from.
CHROMA_DIR = "chromadb"  # storage path handed to PersistentClient
COLLECTION_NAME = "cds_docs"  # Chroma collection the embeddings are written to
EMBEDDING_FILE = "embeddings.json"  # JSON file iterated as filename -> vector pairs
CONTENT_DIR = "content"  # directory joined with each filename to locate its text

# New Chroma client (post-migration): persistent client rooted at CHROMA_DIR,
# with the target collection fetched or created by name.
client = PersistentClient(path=CHROMA_DIR)
collection = client.get_or_create_collection(name=COLLECTION_NAME)

# Load existing embeddings. The loop below consumes the JSON as a mapping of
# content filename -> embedding vector. The file is read as UTF-8 explicitly so
# the result does not depend on the platform's default encoding (the content
# files are already read as UTF-8).
with open(EMBEDDING_FILE, "r", encoding="utf-8") as f:
    embeddings_data = json.load(f)

# Track outcomes so the final message reflects what actually happened.
imported = 0
skipped = 0

# Ingest each document
for filename, vector in embeddings_data.items():
    filepath = os.path.join(CONTENT_DIR, filename)

    # Only the file read is guarded: a FileNotFoundError raised from inside
    # the Chroma call must not be misreported as a missing content file.
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            text = f.read().strip()
    except FileNotFoundError:
        print(f"[WARN] Skipping missing file: {filepath}")
        skipped += 1
        continue

    # upsert (rather than add) keeps the import idempotent: re-running the
    # script refreshes entries whose id (the filename) already exists instead
    # of depending on how add() treats duplicate ids.
    collection.upsert(
        documents=[text],
        metadatas=[{"filename": filename}],
        ids=[filename],
        embeddings=[vector],
    )
    imported += 1

# Report success only when nothing was skipped; otherwise summarize the counts.
if skipped:
    print(
        f"[WARN] Imported {imported} document(s) into Chroma; "
        f"skipped {skipped} missing file(s)"
    )
else:
    print("✅ Embeddings successfully imported into Chroma")