"""
SharePoint Connector Web Application - Production Version

Designed for AWS ECS deployment with IAM task role for DynamoDB access.
Users input their Azure credentials through the web interface.
"""
import os
import secrets
import json
from functools import wraps

from flask import Flask, render_template, redirect, request, session, jsonify
from cryptography.fernet import Fernet

from saas_connector_dynamodb import DynamoDBSharePointConnector, SecureSharePointClient
from llm_client import create_llm_client
from document_parser import DocumentParser, get_file_info
from vector_store_postgres import PostgreSQLVectorStore, create_embedding_provider
from background_indexer import get_indexer
from storage.credentials_storage import get_credentials_storage

# Initialize Flask app
app = Flask(__name__)
app.secret_key = os.getenv("FLASK_SECRET_KEY", secrets.token_urlsafe(32))

# Per-user connector cache.
# FIX: the previous version kept ONE global connector that was rebuilt every
# time a request arrived from a different user, clobbering other users'
# connectors under any concurrent/multi-user traffic. A dict keyed by user_id
# keeps one connector per configured user instead.
_CONNECTORS = {}

# Persistent credentials storage
credentials_storage = get_credentials_storage()

# In-memory cache for user configs (loaded from persistent storage)
USER_CONFIGS = {}

# Chat conversation storage (in-memory, per user per document)
CHAT_CONVERSATIONS = {}

# Initialize LLM client.
# Default to Ollama, but can be swapped using environment variables.
llm_provider = os.getenv("LLM_PROVIDER", "ollama")
llm_client = create_llm_client(llm_provider)

# Initialize document parser
document_parser = DocumentParser()

# Initialize vector store for multi-document chat with RAG (PostgreSQL + pgvector).
# If it cannot be reached, the app still runs; multi-document routes return 503.
embedding_provider_type = os.getenv("EMBEDDING_PROVIDER", "ollama")
try:
    embedding_provider = create_embedding_provider(embedding_provider_type)
    vector_store = PostgreSQLVectorStore(
        embedding_provider=embedding_provider,
        table_prefix=os.getenv("TABLE_PREFIX", "prod_"),
    )
    print("āœ… PostgreSQL vector store initialized")
except Exception as e:
    print(f"āš ļø Warning: Could not initialize vector store: {e}")
    print("   Multi-document chat will not be available")
    print("   Make sure PostgreSQL with pgvector extension is running")
    vector_store = None


def get_or_create_connector():
    """Return a DynamoDB SharePoint connector for the session user.

    Ensures the session has a user_id (a demo id is minted if absent).
    Returns None when the user has not yet saved Azure credentials
    (see /api/config/save); callers treat that as "not configured".
    Connectors are cached per user in _CONNECTORS and reused.
    """
    user_id = session.get("user_id")
    if not user_id:
        # Create demo user ID
        user_id = "user_" + secrets.token_hex(8)
        session["user_id"] = user_id

    # Check if user has configured credentials
    if user_id not in USER_CONFIGS:
        return None

    config = USER_CONFIGS[user_id]

    # Create connector with this user's credentials (or reuse the cached one)
    conn = _CONNECTORS.get(user_id)
    if conn is None:
        conn = DynamoDBSharePointConnector(
            client_id=config["client_id"],
            client_secret=config["client_secret"],
            redirect_uri=request.url_root.rstrip('/') + '/sharepoint/callback',
            encryption_key=config["encryption_key"],
            aws_region=os.getenv("AWS_REGION", "ap-southeast-2"),  # Sydney region
            dynamodb_endpoint=None,  # Use real AWS DynamoDB
            table_prefix=os.getenv("TABLE_PREFIX", "prod_"),
            tenant_id=config["tenant_id"],
        )
        conn._user_id = user_id
        _CONNECTORS[user_id] = conn
    return conn


# Authentication decorator
def require_auth(f):
    """Decorator: ensure the session has a user_id (mints a demo one)."""
    @wraps(f)
    def decorated_function(*args, **kwargs):
        if "user_id" not in session:
            session["user_id"] = "user_" + secrets.token_hex(8)
        return f(*args, **kwargs)
    return decorated_function


def require_config(f):
    """Decorator: return 400 if the user has not saved Azure credentials."""
    @wraps(f)
    def decorated_function(*args, **kwargs):
        conn = get_or_create_connector()
        if not conn:
            return jsonify({"error": "Not configured"}), 400
        return f(*args, **kwargs)
    return decorated_function


# ============================================================================
# WEB UI ROUTES
# ============================================================================

@app.route("/")
@require_auth
def index():
    """Main page with web UI."""
    return render_template("index.html")


# ============================================================================
# CONFIGURATION API
# ============================================================================

@app.route("/api/config/check")
@require_auth
def check_config():
    """Check if the session user has configured Azure credentials."""
    user_id = session.get("user_id")

    # Check in-memory cache first
    if user_id in USER_CONFIGS:
        return jsonify({"configured": True})

    # Check persistent storage; warm the in-memory cache on a hit
    stored_config = credentials_storage.get_config(user_id)
    if stored_config:
        USER_CONFIGS[user_id] = stored_config
        return jsonify({"configured": True})

    return jsonify({"configured": False})


@app.route("/api/config/save", methods=["POST"])
@require_auth
def save_config():
    """Save the user's Azure credentials (client_id, client_secret, tenant).

    Returns 400 when required fields are missing, 500 on storage errors.
    """
    try:
        # FIX: request.json raises / yields None on a missing or non-JSON
        # body, which surfaced as a 500; silent parsing lets the normal
        # validation below return a clean 400 instead.
        data = request.get_json(silent=True) or {}
        client_id = data.get("client_id")
        client_secret = data.get("client_secret")
        tenant_id = data.get("tenant_id", "common")

        if not client_id or not client_secret:
            return jsonify({"error": "Missing client_id or client_secret"}), 400

        user_id = session["user_id"]

        # Reuse the user's existing encryption key if present so previously
        # encrypted tokens stay decryptable; otherwise generate a fresh one.
        existing_config = credentials_storage.get_config(user_id)
        encryption_key = (
            existing_config.get("encryption_key")
            if existing_config
            else Fernet.generate_key().decode()
        )

        # Store configuration
        config = {
            "client_id": client_id,
            "client_secret": client_secret,
            "tenant_id": tenant_id,
            "encryption_key": encryption_key,
        }

        # Save to persistent storage, then update the in-memory cache
        credentials_storage.save_config(user_id, config)
        USER_CONFIGS[user_id] = config

        return jsonify({"success": True})
    except Exception as e:
        return jsonify({"error": str(e)}), 500


@app.route("/api/config/reset", methods=["POST"])
@require_auth
def reset_config():
    """Delete the user's configuration from storage and the cache."""
    user_id = session.get("user_id")

    # Delete from persistent storage
    credentials_storage.delete_config(user_id)

    # Delete from in-memory cache (no-op if absent)
    USER_CONFIGS.pop(user_id, None)

    return jsonify({"success": True})


# ============================================================================
# SHAREPOINT OAUTH ROUTES
# ============================================================================

@app.route("/sharepoint/connect")
@require_auth
@require_config
def connect_sharepoint():
    """Start the SharePoint OAuth flow and redirect to the provider."""
    conn = get_or_create_connector()
    auth_url = conn.initiate_connection(
        user_id=session["user_id"],
        organization_id="default_org",
        return_url="/",
    )
    return redirect(auth_url)


@app.route("/sharepoint/callback")
def sharepoint_callback():
    """OAuth callback: exchange the authorization code for a connection."""
    args = request.args

    if "error" in args:
        description = args.get("error_description", args["error"])
        return render_template("error.html", error=description)

    auth_code = args.get("code")
    state = args.get("state")
    if not (auth_code and state):
        return render_template("error.html", error="Invalid callback - missing code or state")

    conn = get_or_create_connector()
    if not conn:
        return render_template("error.html", error="Configuration not found")

    try:
        connection_info = conn.complete_connection(
            auth_code=auth_code,
            state=state,
            ip_address=request.remote_addr,
            user_agent=request.headers.get("User-Agent"),
        )
    except Exception as e:
        return render_template("error.html", error=str(e))

    # Remember the active connection for this session
    session["sharepoint_connection_id"] = connection_info.id
    return redirect("/")


# ============================================================================
# API ROUTES
# ============================================================================

@app.route("/api/sharepoint/connections")
@require_auth
@require_config
def list_connections():
    """List the session user's SharePoint connections as JSON."""
    conn = get_or_create_connector()
    try:
        rows = []
        for c in conn.list_connections(session["user_id"]):
            rows.append({
                "id": c.id,
                "name": c.connection_name,
                "created_at": c.created_at.isoformat(),
                "last_used_at": c.last_used_at.isoformat() if c.last_used_at else None,
            })
        return jsonify({"connections": rows})
    except Exception as e:
        return jsonify({"error": str(e)}), 500
# FIX: every parameterized route below was registered without its URL
# converter (e.g. "/api/sharepoint/connections//disconnect") while the view
# function takes `connection_id` / `job_id`; Flask cannot bind those
# arguments. The `<connection_id>` / `<job_id>` placeholders are restored.

@app.route("/api/sharepoint/connections/<connection_id>/disconnect", methods=["POST"])
@require_auth
@require_config
def disconnect_sharepoint(connection_id):
    """Disconnect a SharePoint connection and clear it from the session."""
    conn = get_or_create_connector()
    user_id = session["user_id"]
    try:
        conn.disconnect(
            connection_id=connection_id,
            user_id=user_id,
            ip_address=request.remote_addr,
        )

        # Clear session
        if "sharepoint_connection_id" in session:
            del session["sharepoint_connection_id"]

        return jsonify({"success": True})
    except Exception as e:
        return jsonify({"error": str(e)}), 400


@app.route("/api/sharepoint/<connection_id>/sites")
@require_auth
@require_config
def get_sites(connection_id):
    """Get SharePoint sites visible to this connection."""
    conn = get_or_create_connector()
    user_id = session["user_id"]
    try:
        client = SecureSharePointClient(conn, connection_id, user_id)
        sites = client.list_sites()
        return jsonify({"sites": sites})
    except Exception as e:
        return jsonify({"error": str(e)}), 500


@app.route("/api/sharepoint/<connection_id>/files")
@require_auth
@require_config
def get_files(connection_id):
    """List files from a SharePoint site at an optional sub-path."""
    conn = get_or_create_connector()
    user_id = session["user_id"]
    site_id = request.args.get("site_id")
    path = request.args.get("path", "")

    if not site_id:
        return jsonify({"error": "site_id is required"}), 400

    try:
        client = SecureSharePointClient(conn, connection_id, user_id)
        files = client.list_files(site_id, path)
        return jsonify({"files": files})
    except Exception as e:
        return jsonify({"error": str(e)}), 500


@app.route("/api/sharepoint/<connection_id>/read")
@require_auth
@require_config
def read_file(connection_id):
    """Read file content with automatic parsing for different file types.

    Also primes the per-document chat context and, when possible, indexes
    the parsed content into the vector store for multi-document chat.
    """
    conn = get_or_create_connector()
    user_id = session["user_id"]
    site_id = request.args.get("site_id")
    file_path = request.args.get("file_path")

    if not site_id or not file_path:
        return jsonify({"error": "site_id and file_path are required"}), 400

    try:
        client = SecureSharePointClient(conn, connection_id, user_id)

        # Get filename from path
        filename = file_path.split('/')[-1]

        # Read file as binary
        binary_content = client.read_file(site_id, file_path, as_text=False)

        # Try to parse the file
        content = None
        parse_error = None

        if document_parser.can_parse(filename):
            try:
                content = document_parser.parse(binary_content, filename)
            except Exception as parse_err:
                parse_error = str(parse_err)
                # Fallback to text decoding
                # FIX: bare `except:` replaced with `except Exception:` so
                # KeyboardInterrupt/SystemExit are not swallowed.
                try:
                    content = binary_content.decode('utf-8', errors='ignore')
                except Exception:
                    content = f"[Unable to parse file: {parse_error}]"
        else:
            # Unsupported file type - try text decode
            try:
                content = binary_content.decode('utf-8', errors='ignore')
            except Exception:
                # Get file info for context
                file_info = get_file_info(filename, len(binary_content))
                content = f"[{file_info['category'].title()} file - {file_info['size_formatted']} - Preview not available for {file_info['extension']} files]"

        # Store file context for chat
        chat_key = f"{user_id}:{site_id}:{file_path}"
        if chat_key not in CHAT_CONVERSATIONS:
            CHAT_CONVERSATIONS[chat_key] = {
                "content": content,
                "messages": [],
                "filename": filename,
            }
        else:
            CHAT_CONVERSATIONS[chat_key]["content"] = content
            CHAT_CONVERSATIONS[chat_key]["filename"] = filename

        # Optionally add to vector store if available and content is parseable.
        # This allows the document to be used in multi-document chat.
        document_id = None
        if vector_store and content and (document_parser.can_parse(filename) or parse_error is None):
            try:
                # Don't add bracketed error/placeholder messages to vector store
                if not content.startswith("[") or not content.endswith("]"):
                    document_id = vector_store.add_document(
                        user_id=user_id,
                        site_id=site_id,
                        file_path=file_path,
                        filename=filename,
                        content=content,
                        tags=[],  # No tags by default, user can add later
                        chunk_size=1000,
                        chunk_overlap=200,
                    )
            except Exception as vs_error:
                # Don't fail the request if vector store fails
                print(f"Warning: Could not add to vector store: {vs_error}")

        return jsonify({
            "content": content,
            "filename": filename,
            "can_chat": document_parser.can_parse(filename) or parse_error is None,
            "document_id": document_id,
        })
    except Exception as e:
        return jsonify({"error": str(e)}), 500


# ============================================================================
# CHAT / LLM API ROUTES
# ============================================================================

@app.route("/api/chat/send", methods=["POST"])
@require_auth
def chat_send():
    """Send message to LLM about the current document."""
    try:
        # Tolerate a missing/non-JSON body; validation below returns 400.
        data = request.get_json(silent=True) or {}
        user_id = session["user_id"]
        site_id = data.get("site_id")
        file_path = data.get("file_path")
        message = data.get("message")

        if not site_id or not file_path or not message:
            return jsonify({"error": "site_id, file_path, and message are required"}), 400

        chat_key = f"{user_id}:{site_id}:{file_path}"

        # The document must have been read (and its context cached) first
        if chat_key not in CHAT_CONVERSATIONS:
            return jsonify({"error": "No document loaded. Please read a file first."}), 400

        conversation = CHAT_CONVERSATIONS[chat_key]
        document_content = conversation["content"]
        messages = conversation["messages"]

        # Add user message
        messages.append({"role": "user", "content": message})

        # Get LLM response
        try:
            response = llm_client.chat(messages, context=document_content)

            # Add assistant response
            messages.append({"role": "assistant", "content": response})

            return jsonify({"response": response, "messages": messages})
        except Exception as llm_error:
            # Remove user message if LLM failed so a retry doesn't duplicate it
            messages.pop()
            return jsonify({"error": f"LLM error: {str(llm_error)}"}), 500
    except Exception as e:
        return jsonify({"error": str(e)}), 500


@app.route("/api/chat/history")
@require_auth
def chat_history():
    """Get chat history for the current document (empty list if none)."""
    try:
        user_id = session["user_id"]
        site_id = request.args.get("site_id")
        file_path = request.args.get("file_path")

        if not site_id or not file_path:
            return jsonify({"error": "site_id and file_path are required"}), 400

        chat_key = f"{user_id}:{site_id}:{file_path}"
        if chat_key not in CHAT_CONVERSATIONS:
            return jsonify({"messages": []})

        messages = CHAT_CONVERSATIONS[chat_key]["messages"]
        return jsonify({"messages": messages})
    except Exception as e:
        return jsonify({"error": str(e)}), 500


@app.route("/api/chat/clear", methods=["POST"])
@require_auth
def chat_clear():
    """Clear chat history for the current document (content is kept)."""
    try:
        data = request.get_json(silent=True) or {}
        user_id = session["user_id"]
        site_id = data.get("site_id")
        file_path = data.get("file_path")

        if not site_id or not file_path:
            return jsonify({"error": "site_id and file_path are required"}), 400

        chat_key = f"{user_id}:{site_id}:{file_path}"
        if chat_key in CHAT_CONVERSATIONS:
            CHAT_CONVERSATIONS[chat_key]["messages"] = []

        return jsonify({"success": True})
    except Exception as e:
        return jsonify({"error": str(e)}), 500


@app.route("/api/llm/status")
@require_auth
def llm_status():
    """Check if the configured LLM provider is reachable."""
    try:
        available = llm_client.is_available()
        return jsonify({"available": available, "provider": llm_provider})
    except Exception as e:
        return jsonify({
            "available": False,
            "provider": llm_provider,
            "error": str(e),
        })


# ============================================================================
# MULTI-DOCUMENT CHAT / RAG API ROUTES
# ============================================================================

@app.route("/api/documents/add", methods=["POST"])
@require_auth
def add_document_to_vector_store():
    """Add a document to the vector store with tags for multi-document chat."""
    if not vector_store:
        return jsonify({"error": "Vector store not available"}), 503

    try:
        data = request.get_json(silent=True) or {}
        user_id = session["user_id"]
        site_id = data.get("site_id")
        file_path = data.get("file_path")
        filename = data.get("filename")
        content = data.get("content")
        tags = data.get("tags", [])  # e.g., ["HR", "SALES", "Q4-2024"]

        if not all([site_id, file_path, filename, content]):
            return jsonify({"error": "site_id, file_path, filename, and content are required"}), 400

        # Add document to vector store
        document_id = vector_store.add_document(
            user_id=user_id,
            site_id=site_id,
            file_path=file_path,
            filename=filename,
            content=content,
            tags=tags,
            chunk_size=1000,
            chunk_overlap=200,
        )

        return jsonify({
            "success": True,
            "document_id": document_id,
            "tags": tags,
        })
    except Exception as e:
        return jsonify({"error": str(e)}), 500


@app.route("/api/documents/tags")
@require_auth
def list_document_tags():
    """List all tags for the user's documents with counts."""
    if not vector_store:
        return jsonify({"error": "Vector store not available"}), 503

    try:
        user_id = session["user_id"]
        tag_counts = vector_store.list_tags(user_id)
        return jsonify({"tags": tag_counts})
    except Exception as e:
        return jsonify({"error": str(e)}), 500


@app.route("/api/documents/indexed-sites")
@require_auth
def get_indexed_sites():
    """Get list of site IDs that have indexed documents."""
    if not vector_store:
        return jsonify({"error": "Vector store not available"}), 503

    try:
        user_id = session["user_id"]
        site_ids = vector_store.get_indexed_sites(user_id)
        return jsonify({"site_ids": site_ids})
    except Exception as e:
        return jsonify({"error": str(e)}), 500


@app.route("/api/documents/by-tags")
@require_auth
def get_documents_by_tags():
    """Get documents filtered by a comma-separated `tags` query parameter."""
    if not vector_store:
        return jsonify({"error": "Vector store not available"}), 503

    try:
        user_id = session["user_id"]
        tags = request.args.get("tags", "").split(",")
        tags = [t.strip() for t in tags if t.strip()]

        if not tags:
            return jsonify({"error": "tags parameter required"}), 400

        documents = vector_store.get_documents_by_tags(user_id, tags)
        return jsonify({
            "documents": [
                {
                    "document_id": doc.document_id,
                    "filename": doc.filename,
                    "file_path": doc.file_path,
                    "site_id": doc.site_id,
                    "tags": doc.tags,
                    "created_at": doc.created_at,
                    "chunk_count": doc.chunk_count,
                }
                for doc in documents
            ]
        })
    except Exception as e:
        return jsonify({"error": str(e)}), 500


@app.route("/api/documents/update-tags", methods=["POST"])
@require_auth
def update_document_tags():
    """Update tags for a document; 403 if it belongs to another user."""
    if not vector_store:
        return jsonify({"error": "Vector store not available"}), 503

    try:
        data = request.get_json(silent=True) or {}
        user_id = session["user_id"]
        document_id = data.get("document_id")
        tags = data.get("tags", [])

        if not document_id:
            return jsonify({"error": "document_id is required"}), 400

        vector_store.update_document_tags(document_id, user_id, tags)
        return jsonify({"success": True})
    except ValueError as e:
        # Raised by the store on an ownership mismatch — map to 403
        return jsonify({"error": str(e)}), 403
    except Exception as e:
        return jsonify({"error": str(e)}), 500


@app.route("/api/chat/multi", methods=["POST"])
@require_auth
def chat_multi_documents():
    """Chat with multiple documents using RAG (Retrieval-Augmented Generation)."""
    if not vector_store:
        return jsonify({"error": "Vector store not available"}), 503

    try:
        data = request.get_json(silent=True) or {}
        user_id = session["user_id"]
        message = data.get("message")
        tags = data.get("tags", [])  # Optional: filter by tags
        top_k = data.get("top_k", 5)  # Number of relevant chunks to retrieve

        if not message:
            return jsonify({"error": "message is required"}), 400

        # Search for relevant document chunks
        results = vector_store.search(
            user_id=user_id,
            query=message,
            tags=tags if tags else None,
            top_k=top_k,
        )

        if not results:
            return jsonify({
                "error": "No relevant documents found. Please add documents to the vector store first."
            }), 400

        # Build context from retrieved chunks
        context_parts = []
        for i, result in enumerate(results, 1):
            chunk = result["chunk"]
            document = result["document"]
            similarity = result["similarity"]
            context_parts.append(
                f"[Document {i}: {document.filename} (Tags: {', '.join(document.tags)}) - Similarity: {similarity:.2f}]\n"
                f"{chunk.content}\n"
            )
        context = "\n---\n".join(context_parts)

        # Get LLM response with RAG context
        messages = [{"role": "user", "content": message}]

        try:
            response = llm_client.chat(messages, context=context)

            # Build source information
            sources = [
                {
                    "filename": result["document"].filename,
                    "tags": result["document"].tags,
                    "similarity": result["similarity"],
                    "chunk_index": result["chunk"].chunk_index,
                }
                for result in results
            ]

            return jsonify({
                "response": response,
                "sources": sources,
                "context_chunks": len(results),
            })
        except Exception as llm_error:
            return jsonify({"error": f"LLM error: {str(llm_error)}"}), 500
    except Exception as e:
        return jsonify({"error": str(e)}), 500


@app.route("/api/chat/multi/stream", methods=["POST"])
@require_auth
def chat_multi_documents_stream():
    """Chat with multiple documents using RAG with a streaming (SSE) response."""
    if not vector_store:
        return jsonify({"error": "Vector store not available"}), 503

    # Capture request data and session outside of the generator — the Flask
    # request context is not available once streaming begins.
    data = request.get_json(silent=True) or {}
    user_id = session["user_id"]
    message = data.get("message")
    tags = data.get("tags", [])
    top_k = data.get("top_k", 5)

    def generate():
        try:
            if not message:
                yield f"data: {json.dumps({'error': 'message is required'})}\n\n"
                return

            # Search for relevant document chunks
            results = vector_store.search(
                user_id=user_id,
                query=message,
                tags=tags if tags else None,
                top_k=top_k,
            )

            if not results:
                yield f"data: {json.dumps({'error': 'No relevant documents found'})}\n\n"
                return

            # Build context from retrieved chunks
            context_parts = []
            for i, result in enumerate(results, 1):
                chunk = result["chunk"]
                document = result["document"]
                similarity = result["similarity"]
                context_parts.append(
                    f"[Document {i}: {document.filename} (Tags: {', '.join(document.tags)}) - Similarity: {similarity:.2f}]\n"
                    f"{chunk.content}\n"
                )
            context = "\n---\n".join(context_parts)

            # Build source information
            sources = [
                {
                    "filename": result["document"].filename,
                    "tags": result["document"].tags,
                    "similarity": result["similarity"],
                    "chunk_index": result["chunk"].chunk_index,
                }
                for result in results
            ]

            # Send sources first
            yield f"data: {json.dumps({'sources': sources})}\n\n"

            # Get LLM response with RAG context - streaming
            messages = [{"role": "user", "content": message}]

            try:
                # Check if client supports streaming
                if hasattr(llm_client, 'chat_stream'):
                    for chunk in llm_client.chat_stream(messages, context=context):
                        yield f"data: {json.dumps({'chunk': chunk})}\n\n"
                else:
                    # Fallback to non-streaming
                    response = llm_client.chat(messages, context=context)
                    yield f"data: {json.dumps({'chunk': response})}\n\n"

                yield f"data: {json.dumps({'done': True})}\n\n"
            except Exception as llm_error:
                yield f"data: {json.dumps({'error': f'LLM error: {str(llm_error)}'})}\n\n"
        except Exception as e:
            yield f"data: {json.dumps({'error': str(e)})}\n\n"

    return app.response_class(generate(), mimetype='text/event-stream')


# ============================================================================
# BACKGROUND INDEXING API ROUTES
# ============================================================================

@app.route("/api/indexing/start", methods=["POST"])
@require_auth
@require_config
def start_site_indexing():
    """Start background indexing for a SharePoint site."""
    if not vector_store:
        return jsonify({"error": "Vector store not available"}), 503

    try:
        data = request.get_json(silent=True) or {}
        user_id = session["user_id"]
        site_id = data.get("site_id")
        site_name = data.get("site_name", "Unknown Site")
        connection_id = data.get("connection_id")
        path = data.get("path", "")
        tags = data.get("tags", [])  # Tags for all documents in this site

        if not site_id or not connection_id:
            return jsonify({"error": "site_id and connection_id are required"}), 400

        # Generate job ID (uses the module-level `secrets` import; the
        # previous redundant function-local import is removed)
        job_id = f"idx_{user_id}_{site_id}_{secrets.token_hex(4)}"

        # Get connector
        conn = get_or_create_connector()
        if not conn:
            return jsonify({"error": "Not configured"}), 400

        # Start indexing in background
        indexer = get_indexer()
        indexer.start_indexing(
            job_id=job_id,
            site_id=site_id,
            site_name=site_name,
            connection_id=connection_id,
            user_id=user_id,
            connector=conn,
            vector_store=vector_store,
            document_parser=document_parser,
            path=path,
            tags=tags,
        )

        return jsonify({
            "success": True,
            "job_id": job_id,
            "message": f"Started indexing {site_name}",
        })
    except Exception as e:
        return jsonify({"error": str(e)}), 500


@app.route("/api/indexing/status/<job_id>")
@require_auth
def get_indexing_status(job_id):
    """Get the status of a background indexing job."""
    try:
        indexer = get_indexer()
        status = indexer.get_job_status(job_id)

        if not status:
            return jsonify({"error": "Job not found"}), 404

        return jsonify(status)
    except Exception as e:
        return jsonify({"error": str(e)}), 500


@app.route("/api/indexing/cancel/<job_id>", methods=["POST"])
@require_auth
def cancel_indexing(job_id):
    """Cancel a running indexing job."""
    try:
        indexer = get_indexer()
        cancelled = indexer.cancel_job(job_id)

        if cancelled:
            return jsonify({"success": True, "message": "Job cancelled"})
        return jsonify({"error": "Job not found or not running"}), 404
    except Exception as e:
        return jsonify({"error": str(e)}), 500


# ============================================================================
# HEALTH CHECK (for ECS)
# ============================================================================

@app.route("/health")
def health():
    """Health check endpoint for ECS."""
    return jsonify({"status": "healthy"}), 200


# ============================================================================
# ERROR HANDLERS
# ============================================================================

@app.errorhandler(404)
def not_found(e):
    """Return a JSON 404 response."""
    return jsonify({"error": "Not found"}), 404
@app.errorhandler(500)
def internal_error(e):
    """Return a JSON 500 response for unhandled server errors."""
    return jsonify({"error": "Internal server error"}), 500


# ============================================================================
# MAIN
# ============================================================================

def _run_local_server():
    """Print startup information, then launch the development server.

    For local testing only — in ECS the app is served by gunicorn.
    """
    print("āœ… SharePoint Connector starting (Production Mode)...")
    print("šŸ“ Users will enter their Azure credentials through the web interface")
    print(f"šŸŒŽ AWS Region: {os.getenv('AWS_REGION', 'ap-southeast-2')}")
    print(f"šŸ·ļø Table Prefix: {os.getenv('TABLE_PREFIX', 'prod_')}")
    print(f"šŸ”’ Using IAM task role for DynamoDB access")
    print("\nšŸš€ Open http://localhost:8000 in your browser\n")
    print("šŸ“‹ Users will need:")
    print("   - Azure App Registration Client ID")
    print("   - Azure App Registration Client Secret")
    print("   - Tenant ID (or use 'common' for multi-tenant)")
    print()

    app.run(
        debug=False,  # Never True in production
        host="0.0.0.0",
        port=int(os.getenv("PORT", 8000)),
    )


if __name__ == "__main__":
    _run_local_server()