""" SharePoint Connector Web Application - Development Version For local testing with DynamoDB Local. Users input their Azure credentials through the web interface. """ import os import secrets import json from flask import Flask, render_template, redirect, request, session, jsonify from functools import wraps from saas_connector_dynamodb import DynamoDBSharePointConnector, SecureSharePointClient from cryptography.fernet import Fernet from llm_client import create_llm_client from document_parser import DocumentParser, get_file_info from vector_store_postgres import PostgreSQLVectorStore, create_embedding_provider from background_indexer import get_indexer from storage.credentials_storage import get_credentials_storage from dotenv import load_dotenv load_dotenv() # Initialize Flask app app = Flask(__name__) app.secret_key = os.getenv("FLASK_SECRET_KEY", "dev-secret-key-change-in-production") # Global connector (will be initialized with user credentials) connector = None # Persistent credentials storage credentials_storage = get_credentials_storage() # In-memory cache for user configs (loaded from persistent storage) USER_CONFIGS = {} # Chat conversation storage (in-memory, per user per document) CHAT_CONVERSATIONS = {} # Initialize LLM client # Default to Ollama, but can be swapped using environment variables llm_provider = os.getenv("LLM_PROVIDER", "ollama") llm_client = create_llm_client(llm_provider) # Initialize document parser document_parser = DocumentParser() # Initialize vector store for multi-document chat with RAG (PostgreSQL + pgvector) embedding_provider_type = os.getenv("EMBEDDING_PROVIDER", "ollama") try: embedding_provider = create_embedding_provider(embedding_provider_type) vector_store = PostgreSQLVectorStore( embedding_provider=embedding_provider, table_prefix="dev_" ) print("āœ… PostgreSQL vector store initialized") except Exception as e: print(f"āš ļø Warning: Could not initialize vector store: {e}") print(" Multi-document chat will not be available") print(" Make sure PostgreSQL with pgvector extension is running") vector_store = None def get_or_create_connector(): """Get or create connector with user's credentials.""" global connector user_id = session.get("user_id") if not user_id: # Create demo user ID user_id = "user_" + secrets.token_hex(8) session["user_id"] = user_id # Check if user has configured credentials if user_id not in USER_CONFIGS: return None config = USER_CONFIGS[user_id] # Create connector with user's credentials if not connector or getattr(connector, '_user_id', None) != user_id: connector = DynamoDBSharePointConnector( client_id=config["client_id"], client_secret=config["client_secret"], redirect_uri=request.url_root.rstrip('/') + '/sharepoint/callback', encryption_key=config["encryption_key"], aws_region="ap-southeast-2", dynamodb_endpoint="http://localhost:8000", # Local DynamoDB table_prefix="dev_", tenant_id=config["tenant_id"] ) connector._user_id = user_id return connector # Authentication decorator def require_auth(f): """Decorator to require authentication.""" @wraps(f) def decorated_function(*args, **kwargs): if "user_id" not in session: session["user_id"] = "user_" + secrets.token_hex(8) return f(*args, **kwargs) return decorated_function def require_config(f): """Decorator to require configuration.""" @wraps(f) def decorated_function(*args, **kwargs): conn = get_or_create_connector() if not conn: return jsonify({"error": "Not configured"}), 400 return f(*args, **kwargs) return decorated_function # ============================================================================ # WEB UI ROUTES # ============================================================================ @app.route("/") @require_auth def index(): """Main page with web UI.""" return render_template("index.html") # ============================================================================ # CONFIGURATION API # ============================================================================ @app.route("/api/config/check") @require_auth def check_config(): """Check if user has configured credentials.""" user_id = session.get("user_id") # Check in-memory cache first if user_id in USER_CONFIGS: return jsonify({"configured": True}) # Check persistent storage stored_config = credentials_storage.get_config(user_id) if stored_config: # Load into memory cache USER_CONFIGS[user_id] = stored_config return jsonify({"configured": True}) return jsonify({"configured": False}) @app.route("/api/config/save", methods=["POST"]) @require_auth def save_config(): """Save user's Azure credentials.""" try: data = request.json client_id = data.get("client_id") client_secret = data.get("client_secret") tenant_id = data.get("tenant_id", "common") if not client_id or not client_secret: return jsonify({"error": "Missing client_id or client_secret"}), 400 user_id = session["user_id"] # Generate encryption key for this user (or reuse existing) existing_config = credentials_storage.get_config(user_id) encryption_key = existing_config.get("encryption_key") if existing_config else Fernet.generate_key().decode() # Store configuration config = { "client_id": client_id, "client_secret": client_secret, "tenant_id": tenant_id, "encryption_key": encryption_key } # Save to persistent storage credentials_storage.save_config(user_id, config) # Update in-memory cache USER_CONFIGS[user_id] = config return jsonify({"success": True}) except Exception as e: return jsonify({"error": str(e)}), 500 @app.route("/api/config/reset", methods=["POST"]) @require_auth def reset_config(): """Reset user's configuration.""" user_id = session.get("user_id") # Delete from persistent storage credentials_storage.delete_config(user_id) # Delete from in-memory cache if user_id in USER_CONFIGS: del USER_CONFIGS[user_id] return jsonify({"success": True}) # ============================================================================ # SHAREPOINT OAUTH ROUTES # ============================================================================ @app.route("/sharepoint/connect") @require_auth @require_config def connect_sharepoint(): """Initiate SharePoint connection.""" conn = get_or_create_connector() user_id = session["user_id"] auth_url = conn.initiate_connection( user_id=user_id, organization_id="default_org", return_url="/" ) return redirect(auth_url) @app.route("/sharepoint/callback") def sharepoint_callback(): """OAuth callback endpoint.""" if "error" in request.args: return render_template("error.html", error=request.args.get("error_description", request.args["error"])) auth_code = request.args.get("code") state = request.args.get("state") if not auth_code or not state: return render_template("error.html", error="Invalid callback - missing code or state") try: conn = get_or_create_connector() if not conn: return render_template("error.html", error="Configuration not found") connection_info = conn.complete_connection( auth_code=auth_code, state=state, ip_address=request.remote_addr, user_agent=request.headers.get("User-Agent") ) # Store connection ID in session session["sharepoint_connection_id"] = connection_info.id return redirect("/") except Exception as e: return render_template("error.html", error=str(e)) # ============================================================================ # API ROUTES # ============================================================================ @app.route("/api/sharepoint/connections") @require_auth @require_config def list_connections(): """List user's SharePoint connections.""" conn = get_or_create_connector() user_id = session["user_id"] try: connections = conn.list_connections(user_id) return jsonify({ "connections": [ { "id": c.id, "name": c.connection_name, "created_at": c.created_at.isoformat(), "last_used_at": c.last_used_at.isoformat() if c.last_used_at else None } for c in connections ] }) except Exception as e: return jsonify({"error": str(e)}), 500 @app.route("/api/sharepoint/connections//disconnect", methods=["POST"]) @require_auth @require_config def disconnect_sharepoint(connection_id): """Disconnect SharePoint.""" conn = get_or_create_connector() user_id = session["user_id"] try: conn.disconnect( connection_id=connection_id, user_id=user_id, ip_address=request.remote_addr ) # Clear session if "sharepoint_connection_id" in session: del session["sharepoint_connection_id"] return jsonify({"success": True}) except Exception as e: return jsonify({"error": str(e)}), 400 @app.route("/api/sharepoint//sites") @require_auth @require_config def get_sites(connection_id): """Get SharePoint sites.""" conn = get_or_create_connector() user_id = session["user_id"] try: client = SecureSharePointClient(conn, connection_id, user_id) sites = client.list_sites() return jsonify({"sites": sites}) except Exception as e: import traceback print(f"Error listing sites: {str(e)}") print(traceback.format_exc()) return jsonify({"error": str(e)}), 500 @app.route("/api/sharepoint//files") @require_auth @require_config def get_files(connection_id): """Get files from SharePoint.""" conn = get_or_create_connector() user_id = session["user_id"] site_id = request.args.get("site_id") path = request.args.get("path", "") if not site_id: return jsonify({"error": "site_id is required"}), 400 try: client = SecureSharePointClient(conn, connection_id, user_id) files = client.list_files(site_id, path) return jsonify({"files": files}) except Exception as e: return jsonify({"error": str(e)}), 500 @app.route("/api/sharepoint//read") @require_auth @require_config def read_file(connection_id): """Read file content with automatic parsing for different file types.""" conn = get_or_create_connector() user_id = session["user_id"] site_id = request.args.get("site_id") file_path = request.args.get("file_path") if not site_id or not file_path: return jsonify({"error": "site_id and file_path are required"}), 400 try: client = SecureSharePointClient(conn, connection_id, user_id) # Get filename from path filename = file_path.split('/')[-1] # Read file as binary binary_content = client.read_file(site_id, file_path, as_text=False) # Try to parse the file content = None parse_error = None if document_parser.can_parse(filename): try: content = document_parser.parse(binary_content, filename) except Exception as parse_err: parse_error = str(parse_err) # Fallback to text decoding try: content = binary_content.decode('utf-8', errors='ignore') except: content = f"[Unable to parse file: {parse_error}]" else: # Unsupported file type - try text decode try: content = binary_content.decode('utf-8', errors='ignore') except: # Get file info for context file_info = get_file_info(filename, len(binary_content)) content = f"[{file_info['category'].title()} file - {file_info['size_formatted']} - Preview not available for {file_info['extension']} files]" # Store file context for chat chat_key = f"{user_id}:{site_id}:{file_path}" if chat_key not in CHAT_CONVERSATIONS: CHAT_CONVERSATIONS[chat_key] = { "content": content, "messages": [], "filename": filename } else: CHAT_CONVERSATIONS[chat_key]["content"] = content CHAT_CONVERSATIONS[chat_key]["filename"] = filename # Optionally add to vector store if available and content is parseable # This allows the document to be used in multi-document chat document_id = None if vector_store and content and (document_parser.can_parse(filename) or parse_error is None): try: # Don't add error messages to vector store if not content.startswith("[") or not content.endswith("]"): document_id = vector_store.add_document( user_id=user_id, site_id=site_id, file_path=file_path, filename=filename, content=content, tags=[], # No tags by default, user can add later chunk_size=1000, chunk_overlap=200 ) except Exception as vs_error: # Don't fail the request if vector store fails print(f"Warning: Could not add to vector store: {vs_error}") return jsonify({ "content": content, "filename": filename, "can_chat": document_parser.can_parse(filename) or parse_error is None, "document_id": document_id }) except Exception as e: return jsonify({"error": str(e)}), 500 # ============================================================================ # CHAT / LLM API ROUTES # ============================================================================ @app.route("/api/chat/send", methods=["POST"]) @require_auth def chat_send(): """Send message to LLM about current document.""" try: data = request.json user_id = session["user_id"] site_id = data.get("site_id") file_path = data.get("file_path") message = data.get("message") if not site_id or not file_path or not message: return jsonify({"error": "site_id, file_path, and message are required"}), 400 chat_key = f"{user_id}:{site_id}:{file_path}" # Get or initialize conversation if chat_key not in CHAT_CONVERSATIONS: return jsonify({"error": "No document loaded. Please read a file first."}), 400 conversation = CHAT_CONVERSATIONS[chat_key] document_content = conversation["content"] messages = conversation["messages"] # Add user message messages.append({ "role": "user", "content": message }) # Get LLM response try: response = llm_client.chat(messages, context=document_content) # Add assistant response messages.append({ "role": "assistant", "content": response }) return jsonify({ "response": response, "messages": messages }) except Exception as llm_error: # Remove user message if LLM failed messages.pop() return jsonify({"error": f"LLM error: {str(llm_error)}"}), 500 except Exception as e: return jsonify({"error": str(e)}), 500 @app.route("/api/chat/history") @require_auth def chat_history(): """Get chat history for current document.""" try: user_id = session["user_id"] site_id = request.args.get("site_id") file_path = request.args.get("file_path") if not site_id or not file_path: return jsonify({"error": "site_id and file_path are required"}), 400 chat_key = f"{user_id}:{site_id}:{file_path}" if chat_key not in CHAT_CONVERSATIONS: return jsonify({"messages": []}) messages = CHAT_CONVERSATIONS[chat_key]["messages"] return jsonify({"messages": messages}) except Exception as e: return jsonify({"error": str(e)}), 500 @app.route("/api/chat/clear", methods=["POST"]) @require_auth def chat_clear(): """Clear chat history for current document.""" try: data = request.json user_id = session["user_id"] site_id = data.get("site_id") file_path = data.get("file_path") if not site_id or not file_path: return jsonify({"error": "site_id and file_path are required"}), 400 chat_key = f"{user_id}:{site_id}:{file_path}" if chat_key in CHAT_CONVERSATIONS: CHAT_CONVERSATIONS[chat_key]["messages"] = [] return jsonify({"success": True}) except Exception as e: return jsonify({"error": str(e)}), 500 @app.route("/api/llm/status") @require_auth def llm_status(): """Check if LLM is available.""" try: available = llm_client.is_available() return jsonify({ "available": available, "provider": llm_provider }) except Exception as e: return jsonify({ "available": False, "provider": llm_provider, "error": str(e) }) # ============================================================================ # MULTI-DOCUMENT CHAT / RAG API ROUTES # ============================================================================ @app.route("/api/documents/add", methods=["POST"]) @require_auth def add_document_to_vector_store(): """Add document to vector store with tags for multi-document chat.""" if not vector_store: return jsonify({"error": "Vector store not available"}), 503 try: data = request.json user_id = session["user_id"] site_id = data.get("site_id") file_path = data.get("file_path") filename = data.get("filename") content = data.get("content") tags = data.get("tags", []) # e.g., ["HR", "SALES", "Q4-2024"] if not all([site_id, file_path, filename, content]): return jsonify({"error": "site_id, file_path, filename, and content are required"}), 400 # Add document to vector store document_id = vector_store.add_document( user_id=user_id, site_id=site_id, file_path=file_path, filename=filename, content=content, tags=tags, chunk_size=1000, chunk_overlap=200 ) return jsonify({ "success": True, "document_id": document_id, "tags": tags }) except Exception as e: return jsonify({"error": str(e)}), 500 @app.route("/api/documents/tags") @require_auth def list_document_tags(): """List all tags for user's documents with counts.""" if not vector_store: return jsonify({"error": "Vector store not available"}), 503 try: user_id = session["user_id"] tag_counts = vector_store.list_tags(user_id) return jsonify({ "tags": tag_counts }) except Exception as e: return jsonify({"error": str(e)}), 500 @app.route("/api/documents/indexed-sites") @require_auth def get_indexed_sites(): """Get list of site IDs that have indexed documents.""" if not vector_store: return jsonify({"error": "Vector store not available"}), 503 try: user_id = session["user_id"] site_ids = vector_store.get_indexed_sites(user_id) return jsonify({ "site_ids": site_ids }) except Exception as e: return jsonify({"error": str(e)}), 500 @app.route("/api/documents/by-tags") @require_auth def get_documents_by_tags(): """Get documents filtered by tags.""" if not vector_store: return jsonify({"error": "Vector store not available"}), 503 try: user_id = session["user_id"] tags = request.args.get("tags", "").split(",") tags = [t.strip() for t in tags if t.strip()] if not tags: return jsonify({"error": "tags parameter required"}), 400 documents = vector_store.get_documents_by_tags(user_id, tags) return jsonify({ "documents": [ { "document_id": doc.document_id, "filename": doc.filename, "file_path": doc.file_path, "site_id": doc.site_id, "tags": doc.tags, "created_at": doc.created_at, "chunk_count": doc.chunk_count } for doc in documents ] }) except Exception as e: return jsonify({"error": str(e)}), 500 @app.route("/api/documents/update-tags", methods=["POST"]) @require_auth def update_document_tags(): """Update tags for a document.""" if not vector_store: return jsonify({"error": "Vector store not available"}), 503 try: data = request.json user_id = session["user_id"] document_id = data.get("document_id") tags = data.get("tags", []) if not document_id: return jsonify({"error": "document_id is required"}), 400 vector_store.update_document_tags(document_id, user_id, tags) return jsonify({"success": True}) except ValueError as e: return jsonify({"error": str(e)}), 403 except Exception as e: return jsonify({"error": str(e)}), 500 @app.route("/api/chat/multi", methods=["POST"]) @require_auth def chat_multi_documents(): """Chat with multiple documents using RAG (Retrieval-Augmented Generation).""" if not vector_store: return jsonify({"error": "Vector store not available"}), 503 try: data = request.json user_id = session["user_id"] message = data.get("message") tags = data.get("tags", []) # Optional: filter by tags top_k = data.get("top_k", 5) # Number of relevant chunks to retrieve if not message: return jsonify({"error": "message is required"}), 400 # Search for relevant document chunks results = vector_store.search( user_id=user_id, query=message, tags=tags if tags else None, top_k=top_k ) if not results: return jsonify({ "error": "No relevant documents found. Please add documents to the vector store first." }), 400 # Build context from retrieved chunks context_parts = [] for i, result in enumerate(results, 1): chunk = result["chunk"] document = result["document"] similarity = result["similarity"] context_parts.append( f"[Document {i}: {document.filename} (Tags: {', '.join(document.tags)}) - Similarity: {similarity:.2f}]\n" f"{chunk.content}\n" ) context = "\n---\n".join(context_parts) # Get LLM response with RAG context messages = [ { "role": "user", "content": message } ] try: response = llm_client.chat(messages, context=context) # Build source information sources = [ { "filename": result["document"].filename, "tags": result["document"].tags, "similarity": result["similarity"], "chunk_index": result["chunk"].chunk_index } for result in results ] return jsonify({ "response": response, "sources": sources, "context_chunks": len(results) }) except Exception as llm_error: return jsonify({"error": f"LLM error: {str(llm_error)}"}), 500 except Exception as e: return jsonify({"error": str(e)}), 500 @app.route("/api/chat/multi/stream", methods=["POST"]) @require_auth def chat_multi_documents_stream(): """Chat with multiple documents using RAG with streaming response.""" if not vector_store: return jsonify({"error": "Vector store not available"}), 503 # Capture request data and session outside of generator data = request.json user_id = session["user_id"] message = data.get("message") tags = data.get("tags", []) top_k = data.get("top_k", 5) def generate(): try: if not message: yield f"data: {json.dumps({'error': 'message is required'})}\n\n" return # Search for relevant document chunks results = vector_store.search( user_id=user_id, query=message, tags=tags if tags else None, top_k=top_k ) if not results: yield f"data: {json.dumps({'error': 'No relevant documents found'})}\n\n" return # Build context from retrieved chunks context_parts = [] for i, result in enumerate(results, 1): chunk = result["chunk"] document = result["document"] similarity = result["similarity"] context_parts.append( f"[Document {i}: {document.filename} (Tags: {', '.join(document.tags)}) - Similarity: {similarity:.2f}]\n" f"{chunk.content}\n" ) context = "\n---\n".join(context_parts) # Build source information sources = [ { "filename": result["document"].filename, "tags": result["document"].tags, "similarity": result["similarity"], "chunk_index": result["chunk"].chunk_index } for result in results ] # Send sources first yield f"data: {json.dumps({'sources': sources})}\n\n" # Get LLM response with RAG context - streaming messages = [{"role": "user", "content": message}] try: # Check if client supports streaming if hasattr(llm_client, 'chat_stream'): for chunk in llm_client.chat_stream(messages, context=context): yield f"data: {json.dumps({'chunk': chunk})}\n\n" else: # Fallback to non-streaming response = llm_client.chat(messages, context=context) yield f"data: {json.dumps({'chunk': response})}\n\n" yield f"data: {json.dumps({'done': True})}\n\n" except Exception as llm_error: yield f"data: {json.dumps({'error': f'LLM error: {str(llm_error)}'})}\n\n" except Exception as e: yield f"data: {json.dumps({'error': str(e)})}\n\n" return app.response_class(generate(), mimetype='text/event-stream') # ============================================================================ # BACKGROUND INDEXING API ROUTES # ============================================================================ @app.route("/api/indexing/start", methods=["POST"]) @require_auth @require_config def start_site_indexing(): """Start background indexing for a SharePoint site.""" if not vector_store: return jsonify({"error": "Vector store not available"}), 503 try: data = request.json user_id = session["user_id"] site_id = data.get("site_id") site_name = data.get("site_name", "Unknown Site") connection_id = data.get("connection_id") path = data.get("path", "") tags = data.get("tags", []) # Tags for all documents in this site if not site_id or not connection_id: return jsonify({"error": "site_id and connection_id are required"}), 400 # Generate job ID import secrets job_id = f"idx_{user_id}_{site_id}_{secrets.token_hex(4)}" # Get connector conn = get_or_create_connector() if not conn: return jsonify({"error": "Not configured"}), 400 # Start indexing in background indexer = get_indexer() job = indexer.start_indexing( job_id=job_id, site_id=site_id, site_name=site_name, connection_id=connection_id, user_id=user_id, connector=conn, vector_store=vector_store, document_parser=document_parser, path=path, tags=tags ) return jsonify({ "success": True, "job_id": job_id, "message": f"Started indexing {site_name}" }) except Exception as e: return jsonify({"error": str(e)}), 500 @app.route("/api/indexing/status/") @require_auth def get_indexing_status(job_id): """Get the status of a background indexing job.""" try: indexer = get_indexer() status = indexer.get_job_status(job_id) if not status: return jsonify({"error": "Job not found"}), 404 return jsonify(status) except Exception as e: return jsonify({"error": str(e)}), 500 @app.route("/api/indexing/cancel/", methods=["POST"]) @require_auth def cancel_indexing(job_id): """Cancel a running indexing job.""" try: indexer = get_indexer() cancelled = indexer.cancel_job(job_id) if cancelled: return jsonify({"success": True, "message": "Job cancelled"}) else: return jsonify({"error": "Job not found or not running"}), 404 except Exception as e: return jsonify({"error": str(e)}), 500 # ============================================================================ # DEVELOPMENT UTILITIES # ============================================================================ @app.route("/dev/tables") def dev_list_tables(): """Development endpoint to list DynamoDB tables.""" try: import boto3 dynamodb = boto3.client('dynamodb', region_name='ap-southeast-2', endpoint_url='http://localhost:8000') tables = dynamodb.list_tables() return jsonify({"tables": tables.get('TableNames', [])}) except Exception as e: return jsonify({"error": str(e)}), 500 @app.route("/dev/clear-session", methods=["POST"]) def dev_clear_session(): """Development endpoint to clear session.""" session.clear() return jsonify({"success": True}) # ============================================================================ # HEALTH CHECK # ============================================================================ @app.route("/health") def health(): """Health check endpoint.""" return jsonify({"status": "healthy", "mode": "development"}), 200 # ============================================================================ # TOOTHFAIRYAI AGENT SEARCH ENDPOINT # ============================================================================ @app.route("/api/search/chunks", methods=["POST"]) def api_tool_search(): """Pure vector search endpoint specifically for the ToothFairyAI Agent.""" if not vector_store: return jsonify({"error": "Vector store not available"}), 503 try: data = request.json query = data.get("query") if not query: return jsonify({"error": "Missing 'query' parameter"}), 400 # For local development, grab the first active user ID from memory # In production, you would authenticate this via the 'authorisationType' token user_id = next(iter(USER_CONFIGS.keys()), None) if not user_id: return jsonify({"error": "No active user session found to query against"}), 400 # Search the local PostgreSQL vector store results = vector_store.search( user_id=user_id, query=query, top_k=5 ) # Format a clean JSON response for the AI Agent to read chunks = [ { "source_file": r["document"].filename, "relevance_score": round(r["similarity"], 2), "text_content": r["chunk"].content } for r in results ] return jsonify({"results": chunks}) except Exception as e: return jsonify({"error": str(e)}), 500 # ============================================================================ # ERROR HANDLERS # ============================================================================ @app.errorhandler(404) def not_found(e): return jsonify({"error": "Not found"}), 404 @app.errorhandler(500) def internal_error(e): return jsonify({"error": "Internal server error"}), 500 # ============================================================================ # MAIN # ============================================================================ if __name__ == "__main__": port = int(os.getenv("PORT", 5001)) print("āœ… SharePoint Connector starting (Development Mode)...") print("šŸ”§ Using LOCAL DynamoDB at http://localhost:8000") print("šŸ“ Users will enter their Azure credentials through the web interface") print("šŸ·ļø Table Prefix: dev_") print("\nāš ļø DEVELOPMENT MODE - DO NOT USE IN PRODUCTION") print("\nšŸ“‹ Prerequisites:") print(" 1. Start DynamoDB Local:") print(" docker run -p 8000:8000 amazon/dynamodb-local") print(f"\n 2. Open http://localhost:{port} in your browser") print("\nšŸ› ļø Development endpoints:") print(" GET /dev/tables - List DynamoDB tables") print(" POST /dev/clear-session - Clear session") print() # Run Flask app with debug mode enabled app.run( debug=True, # Enable debug mode for development host="0.0.0.0", port=port )