Update 2025-04-13_15:16:39
This commit is contained in:
		
							
								
								
									
										43
									
								
								wrap_embeddings.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										43
									
								
								wrap_embeddings.sh
									
									
									
									
									
										Executable file
									
								
							| @ -0,0 +1,43 @@ | ||||
#!/bin/bash
# Script Version: 0.3
# Description: Convert each .txt in content/ to .json with embedding in json/
#
# Usage: run from the directory that contains content/; takes no arguments.
# For every content/<name>.txt a json/<name>.json is written holding
# {"id": <name>, "text": <file contents>, "embedding": [floats...]}.
# Exit status: 0 when every file was converted, 1 on a missing dependency,
# missing input, or when at least one file could not be converted.

# -u: fail on unset variables; pipefail: never let a pipeline hide a failure.
# -e is deliberately not used: every critical command is checked explicitly.
set -uo pipefail

# Set variables
readonly CONTENT_DIR="./content"
readonly JSON_DIR="./json"
readonly EMBEDDING_MODEL="sentence-transformers/all-MiniLM-L6-v2"

#######################################
# Report whether a directory holds at least one *.txt entry.
# Arguments: $1 - directory to inspect
# Returns:   0 if the directory exists and the glob matches, 1 otherwise
#######################################
has_txt_files() {
  local dir=$1
  local -a matches=()

  [[ -d "$dir" ]] || return 1

  # nullglob makes an unmatched pattern expand to nothing instead of staying
  # as the literal string, so the array length is a reliable match count.
  shopt -s nullglob
  matches=("$dir"/*.txt)
  shopt -u nullglob

  (( ${#matches[@]} > 0 ))
}

#######################################
# Encode every .txt file in CONTENT_DIR and write one JSON file per input
# into JSON_DIR.
# Globals:   CONTENT_DIR, JSON_DIR, EMBEDDING_MODEL (read)
# Outputs:   one "[DEBUG] Saved" line per written file and one "[ERROR]"
#            line per failed file; stderr is merged into stdout so all
#            messages land on the same stream
# Returns:   exit status of the Python program: 0 if every file was
#            converted, non-zero if any file failed or the program crashed
#######################################
generate_embeddings() {
  # The program is fed through a quoted heredoc so the shell performs no
  # expansion inside the Python text; all values travel as argv entries
  # rather than being spliced into the source.
  python3 - "$CONTENT_DIR" "$JSON_DIR" "$EMBEDDING_MODEL" 2>&1 <<'PY'
import json
import os
import sys

from sentence_transformers import SentenceTransformer

content_dir, json_dir, model_name = sys.argv[1:4]
model = SentenceTransformer(model_name)

failures = 0
# sorted() gives a stable processing order; os.listdir order is arbitrary.
for txt_file in sorted(os.listdir(content_dir)):
    if not txt_file.endswith('.txt'):
        continue
    base_name = txt_file[:-4]
    try:
        with open(os.path.join(content_dir, txt_file), 'r', encoding='utf-8') as f:
            text = f.read()
        embedding = model.encode([text])[0].tolist()
        with open(os.path.join(json_dir, f'{base_name}.json'), 'w', encoding='utf-8') as f:
            json.dump({'id': base_name, 'text': text, 'embedding': embedding}, f)
        print(f'[DEBUG] ✅ Saved: {json_dir}/{base_name}.json', flush=True)
    except Exception as e:
        # Keep going so one bad file does not block the rest, but remember
        # the failure so the caller learns about it through the exit status.
        failures += 1
        print(f'[ERROR] ❌ Failed: {txt_file} - {e}', file=sys.stderr, flush=True)

sys.exit(1 if failures else 0)
PY
}

#######################################
# Entry point: verify dependencies and inputs, then run the conversion.
# Globals:   CONTENT_DIR, JSON_DIR (read)
# Returns:   exits 1 on any failure; returns 0 on full success
#######################################
main() {
  # Check dependencies
  if ! command -v python3 >/dev/null 2>&1; then
    echo "[ERROR] ❌ python3 not found in PATH"
    exit 1
  fi
  # Import errors are silenced on purpose; the message below replaces them.
  if ! python3 -c "import sentence_transformers" 2>/dev/null; then
    echo "[ERROR] ❌ sentence-transformers not installed. Run: pip3 install sentence-transformers"
    exit 1
  fi

  # Check input files
  if ! has_txt_files "$CONTENT_DIR"; then
    echo "[ERROR] ❌ No .txt files found in $CONTENT_DIR"
    exit 1
  fi

  # Create the output directory only once there is something to write.
  if ! mkdir -p -- "$JSON_DIR"; then
    echo "[ERROR] ❌ Cannot create output directory $JSON_DIR"
    exit 1
  fi

  # Generate embeddings
  if ! generate_embeddings; then
    echo "[ERROR] ❌ Embedding generation failed for one or more files"
    exit 1
  fi

  echo "✅ All .txt files converted to JSON with embeddings in $JSON_DIR"
}

main "$@"
		Reference in New Issue
	
	Block a user