import json

import chromadb
import firebase_admin
from firebase_admin import credentials, firestore

from encoder import SentenceEncoder


def initialize_firebase_with_file():
    """Initialize Firebase using a local ``serviceAccountKey.json`` file.

    Returns:
        A Firestore client on success, or ``None`` if initialization failed.
        Failures are printed rather than raised so the caller can bail out
        with a plain ``None`` check.
    """
    try:
        # Use the service account key file
        cred = credentials.Certificate("serviceAccountKey.json")
        # Only register the app when none has been registered yet.
        if not firebase_admin._apps:
            firebase_admin.initialize_app(cred)
        db = firestore.client()
        print("✅ Firebase connection initialized from file.")
        return db
    except Exception as e:
        # Top-level boundary for this script: report and signal failure.
        print(f"❌ Could not initialize Firebase from file. Error: {e}")
        print(" - Make sure 'serviceAccountKey.json' has been uploaded to the terminal.")
        return None


def _build_records(internships):
    """Convert internship dicts into parallel ids, texts and metadatas lists.

    Args:
        internships: Iterable of dicts, each carrying an ``id`` key and
            optionally ``title``, ``description`` and ``skills``.

    Returns:
        A ``(ids, texts, metadatas)`` tuple of equally long lists. The input
        dicts are left untouched; each metadata entry is a shallow copy with
        ``skills`` replaced by its JSON-encoded string.
    """
    ids, texts, metadatas = [], [], []
    for internship in internships:
        # Missing or null skills are treated as "no skills" instead of crashing.
        skills = internship.get("skills") or []
        skills_text = ", ".join(map(str, skills))
        texts.append(
            f"{internship.get('title', '')}. "
            f"{internship.get('description', '')}. "
            f"Skills: {skills_text}"
        )
        # ChromaDB ids must be strings.
        ids.append(str(internship["id"]))
        # The skills list is stored as a JSON string in the metadata.
        metadatas.append({**internship, "skills": json.dumps(skills)})
    return ids, texts, metadatas


def populate_vector_db() -> None:
    """Read internships from Firestore, embed them, and populate ChromaDB.

    Steps: connect to Firestore, open the persistent ``internships`` ChromaDB
    collection, remove whatever it currently holds, fetch every document from
    the Firestore ``internships`` collection, encode a text summary of each
    one, and add ids, embeddings and metadata to ChromaDB.

    Returns early (after printing a message) if Firebase cannot be
    initialized or Firestore has no internship documents.
    """
    db = initialize_firebase_with_file()
    if db is None:
        return

    # 1. Initialize other clients
    encoder = SentenceEncoder()
    chroma_client = chromadb.PersistentClient(path="/data/chroma_db")
    collection = chroma_client.get_or_create_collection(name="internships")

    # 2. Clear existing data
    existing_count = collection.count()
    if existing_count > 0:
        print(f"ℹ️ Clearing {existing_count} existing items from ChromaDB.")
        # include=[] returns only the ids, avoiding a load of every stored
        # document and metadata entry just to delete them.
        collection.delete(ids=collection.get(include=[])["ids"])

    # 3. Fetch data from Firestore
    print("📚 Reading internship data from Firestore...")
    internships = []
    for doc in db.collection("internships").stream():
        record = doc.to_dict()
        # to_dict() holds only stored fields; fall back to the Firestore
        # document id when the document has no explicit 'id' field.
        record.setdefault("id", doc.id)
        internships.append(record)

    if not internships:
        print("❌ No internship data found in Firestore.")
        return

    # 4. Generate embeddings
    print(f"🧠 Generating embeddings for {len(internships)} internships...")
    ids, texts, metadatas = _build_records(internships)
    embeddings = encoder.encode(texts, show_progress_bar=True).tolist()

    # 5. Add to ChromaDB
    print("➕ Adding data to ChromaDB...")
    collection.add(ids=ids, embeddings=embeddings, metadatas=metadatas)
    print(f"✅ Successfully populated ChromaDB with {collection.count()} items.")


if __name__ == "__main__":
    populate_vector_db()