# Importing Embeddings
import time

import chromadb
from chromadb import Documents, EmbeddingFunction, Embeddings
from google import genai
from google.api_core import retry
from google.genai import types


def is_retriable(e: Exception) -> bool:
    """Return True when *e* is a Gemini API error with code 429 or 503.

    Used as the predicate for ``retry.Retry`` so that only these two API
    error codes trigger a retry; any other exception propagates.
    """
    return isinstance(e, genai.errors.APIError) and e.code in {429, 503}


class GeminiEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function backed by the Gemini embedding model.

    The ``document_mode`` flag selects the embedding task type:
    ``"retrieval_document"`` when True, ``"retrieval_query"`` when False.
    """

    # Specify whether to generate embeddings for documents, or queries
    document_mode = True

    @retry.Retry(predicate=is_retriable)
    def __call__(self, input: Documents) -> Embeddings:
        """Embed *input* and return one vector per embedding in the response.

        Args:
            input: The texts to embed, passed through unchanged as the
                ``contents`` of the embedding request.

        Returns:
            The ``values`` of each embedding in the API response.
        """
        # Define the embedding task based on document mode
        embedding_task = (
            "retrieval_document" if self.document_mode else "retrieval_query"
        )

        # Make the API call to Google GenAI for embeddings.
        # NOTE: `client` is not defined in this section; it is expected to be
        # a google-genai client created earlier.
        response = client.models.embed_content(
            model="models/text-embedding-004",
            contents=input,
            config=types.EmbedContentConfig(task_type=embedding_task),
        )

        # Return embeddings
        return [e.values for e in response.embeddings]


# Initialize a Chroma client that stores its data on disk.
# `chromadb.Client(Settings(persist_directory=...))` does not by itself give
# a persistent database; `PersistentClient` is the supported way to get one.
chroma_client = chromadb.PersistentClient(path="./chroma_db")

# Get or create the collection in ChromaDB
DB_NAME = "googlecardb"
embed_fn = GeminiEmbeddingFunction()
embed_fn.document_mode = True

db = chroma_client.get_or_create_collection(
    name=DB_NAME,
    embedding_function=embed_fn,
)

# Add documents to the collection
documents = concatenated_list  # Assuming `concatenated_list` is already defined

for i, doc in enumerate(documents):
    # One document per call, using its position in the list as the ID.
    db.add(documents=[doc], ids=[str(i)])
    time.sleep(0.5)  # Add a delay after each successful addition
    print(f"Added document with ID: {i}, Content (first 100 chars): {doc[:100]}")

print(f"\nFinished adding {len(documents)} documents to the '{DB_NAME}' collection.")

# Check how many documents are in the collection
print(f"Collection contains {db.count()} documents.")

# Peek at a sample document in the collection
print(f"Sample document: {db.peek(1)}")