""" Vector Store for Multi-Document Chat with RAG (Retrieval-Augmented Generation) Supports document embeddings, tagging, and semantic search for chatting with multiple documents at once. """ import os import json from typing import List, Dict, Any, Optional from dataclasses import dataclass, asdict from datetime import datetime import hashlib @dataclass class DocumentChunk: """A chunk of a document with metadata.""" chunk_id: str document_id: str content: str chunk_index: int embedding: Optional[List[float]] = None metadata: Optional[Dict[str, Any]] = None @dataclass class Document: """Document metadata with tags.""" document_id: str user_id: str site_id: str file_path: str filename: str tags: List[str] content_hash: str created_at: str updated_at: str chunk_count: int metadata: Optional[Dict[str, Any]] = None class EmbeddingProvider: """Base class for embedding providers.""" def embed_text(self, text: str) -> List[float]: """Generate embedding for text.""" raise NotImplementedError def embed_batch(self, texts: List[str]) -> List[List[float]]: """Generate embeddings for multiple texts.""" return [self.embed_text(text) for text in texts] class OllamaEmbeddings(EmbeddingProvider): """Ollama embeddings using local models.""" def __init__(self, base_url: str = "http://localhost:11434", model: str = "nomic-embed-text"): """ Initialize Ollama embeddings. Args: base_url: Ollama server URL model: Embedding model (e.g., nomic-embed-text, mxbai-embed-large) """ import requests self.base_url = base_url.rstrip('/') self.model = model self.requests = requests def embed_text(self, text: str) -> List[float]: """Generate embedding using Ollama.""" response = self.requests.post( f"{self.base_url}/api/embeddings", json={ "model": self.model, "prompt": text } ) response.raise_for_status() return response.json()["embedding"] class OpenAIEmbeddings(EmbeddingProvider): """OpenAI embeddings.""" def __init__(self, api_key: str, model: str = "text-embedding-3-small"): """ Initialize OpenAI embeddings. Args: api_key: OpenAI API key model: Embedding model (text-embedding-3-small or text-embedding-3-large) """ import requests self.api_key = api_key self.model = model self.requests = requests def embed_text(self, text: str) -> List[float]: """Generate embedding using OpenAI.""" response = self.requests.post( "https://api.openai.com/v1/embeddings", headers={ "Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json" }, json={ "input": text, "model": self.model } ) response.raise_for_status() return response.json()["data"][0]["embedding"] class InMemoryVectorStore: """ Simple in-memory vector store for development. For production, use a proper vector database like: - Pinecone - Weaviate - Qdrant - ChromaDB - pgvector (PostgreSQL extension) """ def __init__(self, embedding_provider: EmbeddingProvider): """Initialize vector store.""" self.embedding_provider = embedding_provider self.documents: Dict[str, Document] = {} self.chunks: Dict[str, DocumentChunk] = {} self.user_documents: Dict[str, List[str]] = {} # user_id -> [document_ids] self.tag_index: Dict[str, List[str]] = {} # tag -> [document_ids] def add_document( self, user_id: str, site_id: str, file_path: str, filename: str, content: str, tags: List[str], chunk_size: int = 1000, chunk_overlap: int = 200, metadata: Optional[Dict[str, Any]] = None ) -> str: """ Add a document to the vector store with chunking and embeddings. Args: user_id: User ID site_id: SharePoint site ID file_path: File path filename: Filename content: Document content tags: List of tags (e.g., ["HR", "SALES", "Q4-2024"]) chunk_size: Size of each chunk in characters chunk_overlap: Overlap between chunks metadata: Additional metadata Returns: document_id """ # Generate document ID document_id = self._generate_document_id(user_id, site_id, file_path) # Calculate content hash content_hash = hashlib.sha256(content.encode()).hexdigest() # Check if document already exists and hasn't changed if document_id in self.documents: existing_doc = self.documents[document_id] if existing_doc.content_hash == content_hash: # Update tags if different if set(tags) != set(existing_doc.tags): self._update_tags(document_id, existing_doc.tags, tags) existing_doc.tags = tags existing_doc.updated_at = datetime.utcnow().isoformat() return document_id else: # Content changed, remove old chunks self._remove_document_chunks(document_id) # Chunk the document chunks = self._chunk_text(content, chunk_size, chunk_overlap) # Generate embeddings for all chunks chunk_texts = [chunk for chunk in chunks] embeddings = self.embedding_provider.embed_batch(chunk_texts) # Create document chunks document_chunks = [] for idx, (chunk_text, embedding) in enumerate(zip(chunk_texts, embeddings)): chunk_id = f"{document_id}_chunk_{idx}" chunk = DocumentChunk( chunk_id=chunk_id, document_id=document_id, content=chunk_text, chunk_index=idx, embedding=embedding, metadata=metadata ) self.chunks[chunk_id] = chunk document_chunks.append(chunk) # Create document metadata now = datetime.utcnow().isoformat() document = Document( document_id=document_id, user_id=user_id, site_id=site_id, file_path=file_path, filename=filename, tags=tags, content_hash=content_hash, created_at=now, updated_at=now, chunk_count=len(document_chunks), metadata=metadata ) # Store document self.documents[document_id] = document # Update user index if user_id not in self.user_documents: self.user_documents[user_id] = [] if document_id not in self.user_documents[user_id]: self.user_documents[user_id].append(document_id) # Update tag index for tag in tags: tag_lower = tag.lower() if tag_lower not in self.tag_index: self.tag_index[tag_lower] = [] if document_id not in self.tag_index[tag_lower]: self.tag_index[tag_lower].append(document_id) return document_id def search( self, user_id: str, query: str, tags: Optional[List[str]] = None, top_k: int = 5 ) -> List[Dict[str, Any]]: """ Search for relevant document chunks. Args: user_id: User ID (for access control) query: Search query tags: Optional list of tags to filter by top_k: Number of results to return Returns: List of relevant chunks with similarity scores """ # Get query embedding query_embedding = self.embedding_provider.embed_text(query) # Get candidate document IDs if tags: # Filter by tags candidate_doc_ids = set() for tag in tags: tag_lower = tag.lower() if tag_lower in self.tag_index: candidate_doc_ids.update(self.tag_index[tag_lower]) # Intersect with user's documents user_doc_ids = set(self.user_documents.get(user_id, [])) candidate_doc_ids = candidate_doc_ids.intersection(user_doc_ids) else: # All user's documents candidate_doc_ids = set(self.user_documents.get(user_id, [])) # Get chunks from candidate documents candidate_chunks = [ chunk for chunk in self.chunks.values() if chunk.document_id in candidate_doc_ids ] # Calculate cosine similarity for each chunk results = [] for chunk in candidate_chunks: if chunk.embedding: similarity = self._cosine_similarity(query_embedding, chunk.embedding) results.append({ "chunk": chunk, "document": self.documents[chunk.document_id], "similarity": similarity }) # Sort by similarity and return top_k results.sort(key=lambda x: x["similarity"], reverse=True) return results[:top_k] def get_documents_by_tags(self, user_id: str, tags: List[str]) -> List[Document]: """Get all documents with specific tags.""" doc_ids = set() for tag in tags: tag_lower = tag.lower() if tag_lower in self.tag_index: doc_ids.update(self.tag_index[tag_lower]) # Filter by user user_doc_ids = set(self.user_documents.get(user_id, [])) doc_ids = doc_ids.intersection(user_doc_ids) return [self.documents[doc_id] for doc_id in doc_ids if doc_id in self.documents] def list_tags(self, user_id: str) -> Dict[str, int]: """List all tags for user with document counts.""" user_doc_ids = set(self.user_documents.get(user_id, [])) tag_counts = {} for tag, doc_ids in self.tag_index.items(): user_tagged_docs = set(doc_ids).intersection(user_doc_ids) if user_tagged_docs: tag_counts[tag] = len(user_tagged_docs) return tag_counts def update_document_tags(self, document_id: str, user_id: str, tags: List[str]): """Update tags for a document.""" if document_id not in self.documents: raise ValueError("Document not found") doc = self.documents[document_id] if doc.user_id != user_id: raise ValueError("Access denied") old_tags = doc.tags self._update_tags(document_id, old_tags, tags) doc.tags = tags doc.updated_at = datetime.utcnow().isoformat() def remove_document(self, document_id: str, user_id: str): """Remove a document and its chunks.""" if document_id not in self.documents: return doc = self.documents[document_id] if doc.user_id != user_id: raise ValueError("Access denied") # Remove from tag index for tag in doc.tags: tag_lower = tag.lower() if tag_lower in self.tag_index: self.tag_index[tag_lower] = [ d for d in self.tag_index[tag_lower] if d != document_id ] # Remove chunks self._remove_document_chunks(document_id) # Remove from user index if user_id in self.user_documents: self.user_documents[user_id] = [ d for d in self.user_documents[user_id] if d != document_id ] # Remove document del self.documents[document_id] def _chunk_text(self, text: str, chunk_size: int, overlap: int) -> List[str]: """Split text into overlapping chunks.""" if not text: return [] chunks = [] start = 0 text_length = len(text) while start < text_length: end = start + chunk_size # Try to break at sentence boundary if end < text_length: # Look for sentence endings for punct in ['. ', '! ', '? ', '\n\n']: last_punct = text.rfind(punct, start, end) if last_punct != -1: end = last_punct + len(punct) break chunk = text[start:end].strip() if chunk: chunks.append(chunk) start = end - overlap if end < text_length else text_length return chunks def _cosine_similarity(self, vec1: List[float], vec2: List[float]) -> float: """Calculate cosine similarity between two vectors.""" import math dot_product = sum(a * b for a, b in zip(vec1, vec2)) magnitude1 = math.sqrt(sum(a * a for a in vec1)) magnitude2 = math.sqrt(sum(b * b for b in vec2)) if magnitude1 == 0 or magnitude2 == 0: return 0.0 return dot_product / (magnitude1 * magnitude2) def _generate_document_id(self, user_id: str, site_id: str, file_path: str) -> str: """Generate unique document ID.""" combined = f"{user_id}:{site_id}:{file_path}" return hashlib.sha256(combined.encode()).hexdigest() def _remove_document_chunks(self, document_id: str): """Remove all chunks for a document.""" chunk_ids_to_remove = [ chunk_id for chunk_id, chunk in self.chunks.items() if chunk.document_id == document_id ] for chunk_id in chunk_ids_to_remove: del self.chunks[chunk_id] def _update_tags(self, document_id: str, old_tags: List[str], new_tags: List[str]): """Update tag index when tags change.""" # Remove from old tags for tag in old_tags: tag_lower = tag.lower() if tag_lower in self.tag_index: self.tag_index[tag_lower] = [ d for d in self.tag_index[tag_lower] if d != document_id ] # Add to new tags for tag in new_tags: tag_lower = tag.lower() if tag_lower not in self.tag_index: self.tag_index[tag_lower] = [] if document_id not in self.tag_index[tag_lower]: self.tag_index[tag_lower].append(document_id) def create_embedding_provider(provider: str = "ollama", **kwargs) -> EmbeddingProvider: """ Factory function to create embedding provider. Args: provider: Provider name ("ollama" or "openai") **kwargs: Provider-specific configuration Returns: EmbeddingProvider instance Example: # Ollama (default) embeddings = create_embedding_provider("ollama", model="nomic-embed-text") # OpenAI embeddings = create_embedding_provider("openai", api_key="sk-...") """ if provider.lower() == "ollama": return OllamaEmbeddings( base_url=kwargs.get("base_url", os.getenv("OLLAMA_URL", "http://localhost:11434")), model=kwargs.get("model", os.getenv("OLLAMA_EMBED_MODEL", "nomic-embed-text")) ) elif provider.lower() == "openai": api_key = kwargs.get("api_key", os.getenv("OPENAI_API_KEY")) if not api_key: raise ValueError("OpenAI API key required") return OpenAIEmbeddings( api_key=api_key, model=kwargs.get("model", "text-embedding-3-small") ) else: raise ValueError(f"Unknown embedding provider: {provider}")