"""
|
|
SharePoint Connector Web Application - Production Version
|
|
|
|
Designed for AWS ECS deployment with IAM task role for DynamoDB access.
|
|
Users input their Azure credentials through the web interface.
|
|
"""
|
|
|
|
# Standard library
import os
import secrets
import json
from functools import wraps

# Web framework
from flask import Flask, render_template, redirect, request, session, jsonify

# Third-party
from cryptography.fernet import Fernet

# Project modules
from saas_connector_dynamodb import DynamoDBSharePointConnector, SecureSharePointClient
from llm_client import create_llm_client
from document_parser import DocumentParser, get_file_info
from vector_store_postgres import PostgreSQLVectorStore, create_embedding_provider
from background_indexer import get_indexer
from storage.credentials_storage import get_credentials_storage
|
|
|
|
|
|
# Initialize Flask app
app = Flask(__name__)
# Session-signing key. Falls back to a random per-process key when the env
# var is unset — sessions then do not survive a restart or span replicas.
app.secret_key = os.getenv("FLASK_SECRET_KEY", secrets.token_urlsafe(32))

# Global connector (will be initialized with user credentials).
# NOTE(review): a single process-wide connector is shared by all requests;
# get_or_create_connector() rebuilds it whenever the user changes.
connector = None

# Persistent credentials storage
credentials_storage = get_credentials_storage()

# In-memory cache for user configs (loaded from persistent storage)
USER_CONFIGS = {}

# Chat conversation storage (in-memory, per user per document).
# Keyed by "user_id:site_id:file_path"; lost on restart.
CHAT_CONVERSATIONS = {}

# Initialize LLM client
# Default to Ollama, but can be swapped using environment variables
llm_provider = os.getenv("LLM_PROVIDER", "ollama")
llm_client = create_llm_client(llm_provider)

# Initialize document parser
document_parser = DocumentParser()

# Initialize vector store for multi-document chat with RAG (PostgreSQL + pgvector)
embedding_provider_type = os.getenv("EMBEDDING_PROVIDER", "ollama")
|
|
try:
    embedding_provider = create_embedding_provider(embedding_provider_type)
    vector_store = PostgreSQLVectorStore(
        embedding_provider=embedding_provider,
        table_prefix=os.getenv("TABLE_PREFIX", "prod_")
    )
    print("✅ PostgreSQL vector store initialized")
except Exception as e:
    # The vector store is optional: single-document chat still works without
    # it, and the RAG endpoints below return 503 when vector_store is None.
    print(f"⚠️ Warning: Could not initialize vector store: {e}")
    print(" Multi-document chat will not be available")
    print(" Make sure PostgreSQL with pgvector extension is running")
    vector_store = None
|
|
|
|
|
|
def get_or_create_connector():
    """Get or create the DynamoDB-backed connector for the session user.

    Returns:
        The DynamoDBSharePointConnector for the current user, or None when
        the user has not configured Azure credentials yet.
    """
    global connector

    user_id = session.get("user_id")
    if not user_id:
        # First visit: mint a demo user ID and persist it in the session.
        user_id = "user_" + secrets.token_hex(8)
        session["user_id"] = user_id

    # Check the in-memory cache first, then fall back to persistent storage
    # and warm the cache on a hit. Without the fallback (the original only
    # consulted USER_CONFIGS, unlike check_config) a server restart made
    # every request look unconfigured until /api/config/check was called.
    config = USER_CONFIGS.get(user_id)
    if config is None:
        config = credentials_storage.get_config(user_id)
        if not config:
            return None
        USER_CONFIGS[user_id] = config

    # (Re)build the connector when none exists or it belongs to another user.
    # NOTE(review): `connector` is a single process-wide global, so requests
    # from different users rebuild it on every user switch.
    if not connector or getattr(connector, '_user_id', None) != user_id:
        connector = DynamoDBSharePointConnector(
            client_id=config["client_id"],
            client_secret=config["client_secret"],
            redirect_uri=request.url_root.rstrip('/') + '/sharepoint/callback',
            encryption_key=config["encryption_key"],
            aws_region=os.getenv("AWS_REGION", "ap-southeast-2"),  # Sydney region
            dynamodb_endpoint=None,  # Use real AWS DynamoDB
            table_prefix=os.getenv("TABLE_PREFIX", "prod_"),
            tenant_id=config["tenant_id"]
        )
        connector._user_id = user_id

    return connector
|
|
|
|
|
|
# Authentication decorator
|
|
def require_auth(f):
    """Decorator that guarantees the session carries a user_id."""
    @wraps(f)
    def wrapper(*args, **kwargs):
        # Lazily assign an anonymous identity on first contact.
        if "user_id" not in session:
            session["user_id"] = "user_" + secrets.token_hex(8)
        return f(*args, **kwargs)
    return wrapper
|
|
|
|
|
|
def require_config(f):
    """Decorator that rejects requests from users without saved credentials."""
    @wraps(f)
    def wrapper(*args, **kwargs):
        # A None connector means the user never saved Azure credentials.
        if not get_or_create_connector():
            return jsonify({"error": "Not configured"}), 400
        return f(*args, **kwargs)
    return wrapper
|
|
|
|
|
|
# ============================================================================
|
|
# WEB UI ROUTES
|
|
# ============================================================================
|
|
|
|
@app.route("/")
|
|
@require_auth
|
|
def index():
|
|
"""Main page with web UI."""
|
|
return render_template("index.html")
|
|
|
|
|
|
# ============================================================================
|
|
# CONFIGURATION API
|
|
# ============================================================================
|
|
|
|
@app.route("/api/config/check")
|
|
@require_auth
|
|
def check_config():
|
|
"""Check if user has configured credentials."""
|
|
user_id = session.get("user_id")
|
|
|
|
# Check in-memory cache first
|
|
if user_id in USER_CONFIGS:
|
|
return jsonify({"configured": True})
|
|
|
|
# Check persistent storage
|
|
stored_config = credentials_storage.get_config(user_id)
|
|
if stored_config:
|
|
# Load into memory cache
|
|
USER_CONFIGS[user_id] = stored_config
|
|
return jsonify({"configured": True})
|
|
|
|
return jsonify({"configured": False})
|
|
|
|
|
|
@app.route("/api/config/save", methods=["POST"])
|
|
@require_auth
|
|
def save_config():
|
|
"""Save user's Azure credentials."""
|
|
try:
|
|
data = request.json
|
|
client_id = data.get("client_id")
|
|
client_secret = data.get("client_secret")
|
|
tenant_id = data.get("tenant_id", "common")
|
|
|
|
if not client_id or not client_secret:
|
|
return jsonify({"error": "Missing client_id or client_secret"}), 400
|
|
|
|
user_id = session["user_id"]
|
|
|
|
# Generate encryption key for this user (or reuse existing)
|
|
existing_config = credentials_storage.get_config(user_id)
|
|
encryption_key = existing_config.get("encryption_key") if existing_config else Fernet.generate_key().decode()
|
|
|
|
# Store configuration
|
|
config = {
|
|
"client_id": client_id,
|
|
"client_secret": client_secret,
|
|
"tenant_id": tenant_id,
|
|
"encryption_key": encryption_key
|
|
}
|
|
|
|
# Save to persistent storage
|
|
credentials_storage.save_config(user_id, config)
|
|
|
|
# Update in-memory cache
|
|
USER_CONFIGS[user_id] = config
|
|
|
|
return jsonify({"success": True})
|
|
|
|
except Exception as e:
|
|
return jsonify({"error": str(e)}), 500
|
|
|
|
|
|
@app.route("/api/config/reset", methods=["POST"])
|
|
@require_auth
|
|
def reset_config():
|
|
"""Reset user's configuration."""
|
|
user_id = session.get("user_id")
|
|
|
|
# Delete from persistent storage
|
|
credentials_storage.delete_config(user_id)
|
|
|
|
# Delete from in-memory cache
|
|
if user_id in USER_CONFIGS:
|
|
del USER_CONFIGS[user_id]
|
|
|
|
return jsonify({"success": True})
|
|
|
|
|
|
# ============================================================================
|
|
# SHAREPOINT OAUTH ROUTES
|
|
# ============================================================================
|
|
|
|
@app.route("/sharepoint/connect")
|
|
@require_auth
|
|
@require_config
|
|
def connect_sharepoint():
|
|
"""Initiate SharePoint connection."""
|
|
conn = get_or_create_connector()
|
|
user_id = session["user_id"]
|
|
|
|
auth_url = conn.initiate_connection(
|
|
user_id=user_id,
|
|
organization_id="default_org",
|
|
return_url="/"
|
|
)
|
|
|
|
return redirect(auth_url)
|
|
|
|
|
|
@app.route("/sharepoint/callback")
|
|
def sharepoint_callback():
|
|
"""OAuth callback endpoint."""
|
|
if "error" in request.args:
|
|
return render_template("error.html",
|
|
error=request.args.get("error_description", request.args["error"]))
|
|
|
|
auth_code = request.args.get("code")
|
|
state = request.args.get("state")
|
|
|
|
if not auth_code or not state:
|
|
return render_template("error.html", error="Invalid callback - missing code or state")
|
|
|
|
try:
|
|
conn = get_or_create_connector()
|
|
if not conn:
|
|
return render_template("error.html", error="Configuration not found")
|
|
|
|
connection_info = conn.complete_connection(
|
|
auth_code=auth_code,
|
|
state=state,
|
|
ip_address=request.remote_addr,
|
|
user_agent=request.headers.get("User-Agent")
|
|
)
|
|
|
|
# Store connection ID in session
|
|
session["sharepoint_connection_id"] = connection_info.id
|
|
|
|
return redirect("/")
|
|
|
|
except Exception as e:
|
|
return render_template("error.html", error=str(e))
|
|
|
|
|
|
# ============================================================================
|
|
# API ROUTES
|
|
# ============================================================================
|
|
|
|
@app.route("/api/sharepoint/connections")
|
|
@require_auth
|
|
@require_config
|
|
def list_connections():
|
|
"""List user's SharePoint connections."""
|
|
conn = get_or_create_connector()
|
|
user_id = session["user_id"]
|
|
|
|
try:
|
|
connections = conn.list_connections(user_id)
|
|
|
|
return jsonify({
|
|
"connections": [
|
|
{
|
|
"id": c.id,
|
|
"name": c.connection_name,
|
|
"created_at": c.created_at.isoformat(),
|
|
"last_used_at": c.last_used_at.isoformat() if c.last_used_at else None
|
|
}
|
|
for c in connections
|
|
]
|
|
})
|
|
except Exception as e:
|
|
return jsonify({"error": str(e)}), 500
|
|
|
|
|
|
@app.route("/api/sharepoint/connections/<connection_id>/disconnect", methods=["POST"])
|
|
@require_auth
|
|
@require_config
|
|
def disconnect_sharepoint(connection_id):
|
|
"""Disconnect SharePoint."""
|
|
conn = get_or_create_connector()
|
|
user_id = session["user_id"]
|
|
|
|
try:
|
|
conn.disconnect(
|
|
connection_id=connection_id,
|
|
user_id=user_id,
|
|
ip_address=request.remote_addr
|
|
)
|
|
|
|
# Clear session
|
|
if "sharepoint_connection_id" in session:
|
|
del session["sharepoint_connection_id"]
|
|
|
|
return jsonify({"success": True})
|
|
except Exception as e:
|
|
return jsonify({"error": str(e)}), 400
|
|
|
|
|
|
@app.route("/api/sharepoint/<connection_id>/sites")
|
|
@require_auth
|
|
@require_config
|
|
def get_sites(connection_id):
|
|
"""Get SharePoint sites."""
|
|
conn = get_or_create_connector()
|
|
user_id = session["user_id"]
|
|
|
|
try:
|
|
client = SecureSharePointClient(conn, connection_id, user_id)
|
|
sites = client.list_sites()
|
|
|
|
return jsonify({"sites": sites})
|
|
except Exception as e:
|
|
return jsonify({"error": str(e)}), 500
|
|
|
|
|
|
@app.route("/api/sharepoint/<connection_id>/files")
|
|
@require_auth
|
|
@require_config
|
|
def get_files(connection_id):
|
|
"""Get files from SharePoint."""
|
|
conn = get_or_create_connector()
|
|
user_id = session["user_id"]
|
|
site_id = request.args.get("site_id")
|
|
path = request.args.get("path", "")
|
|
|
|
if not site_id:
|
|
return jsonify({"error": "site_id is required"}), 400
|
|
|
|
try:
|
|
client = SecureSharePointClient(conn, connection_id, user_id)
|
|
files = client.list_files(site_id, path)
|
|
|
|
return jsonify({"files": files})
|
|
except Exception as e:
|
|
return jsonify({"error": str(e)}), 500
|
|
|
|
|
|
@app.route("/api/sharepoint/<connection_id>/read")
|
|
@require_auth
|
|
@require_config
|
|
def read_file(connection_id):
|
|
"""Read file content with automatic parsing for different file types."""
|
|
conn = get_or_create_connector()
|
|
user_id = session["user_id"]
|
|
site_id = request.args.get("site_id")
|
|
file_path = request.args.get("file_path")
|
|
|
|
if not site_id or not file_path:
|
|
return jsonify({"error": "site_id and file_path are required"}), 400
|
|
|
|
try:
|
|
client = SecureSharePointClient(conn, connection_id, user_id)
|
|
|
|
# Get filename from path
|
|
filename = file_path.split('/')[-1]
|
|
|
|
# Read file as binary
|
|
binary_content = client.read_file(site_id, file_path, as_text=False)
|
|
|
|
# Try to parse the file
|
|
content = None
|
|
parse_error = None
|
|
|
|
if document_parser.can_parse(filename):
|
|
try:
|
|
content = document_parser.parse(binary_content, filename)
|
|
except Exception as parse_err:
|
|
parse_error = str(parse_err)
|
|
# Fallback to text decoding
|
|
try:
|
|
content = binary_content.decode('utf-8', errors='ignore')
|
|
except:
|
|
content = f"[Unable to parse file: {parse_error}]"
|
|
else:
|
|
# Unsupported file type - try text decode
|
|
try:
|
|
content = binary_content.decode('utf-8', errors='ignore')
|
|
except:
|
|
# Get file info for context
|
|
file_info = get_file_info(filename, len(binary_content))
|
|
content = f"[{file_info['category'].title()} file - {file_info['size_formatted']} - Preview not available for {file_info['extension']} files]"
|
|
|
|
# Store file context for chat
|
|
chat_key = f"{user_id}:{site_id}:{file_path}"
|
|
if chat_key not in CHAT_CONVERSATIONS:
|
|
CHAT_CONVERSATIONS[chat_key] = {
|
|
"content": content,
|
|
"messages": [],
|
|
"filename": filename
|
|
}
|
|
else:
|
|
CHAT_CONVERSATIONS[chat_key]["content"] = content
|
|
CHAT_CONVERSATIONS[chat_key]["filename"] = filename
|
|
|
|
# Optionally add to vector store if available and content is parseable
|
|
# This allows the document to be used in multi-document chat
|
|
document_id = None
|
|
if vector_store and content and (document_parser.can_parse(filename) or parse_error is None):
|
|
try:
|
|
# Don't add error messages to vector store
|
|
if not content.startswith("[") or not content.endswith("]"):
|
|
document_id = vector_store.add_document(
|
|
user_id=user_id,
|
|
site_id=site_id,
|
|
file_path=file_path,
|
|
filename=filename,
|
|
content=content,
|
|
tags=[], # No tags by default, user can add later
|
|
chunk_size=1000,
|
|
chunk_overlap=200
|
|
)
|
|
except Exception as vs_error:
|
|
# Don't fail the request if vector store fails
|
|
print(f"Warning: Could not add to vector store: {vs_error}")
|
|
|
|
return jsonify({
|
|
"content": content,
|
|
"filename": filename,
|
|
"can_chat": document_parser.can_parse(filename) or parse_error is None,
|
|
"document_id": document_id
|
|
})
|
|
except Exception as e:
|
|
return jsonify({"error": str(e)}), 500
|
|
|
|
|
|
# ============================================================================
|
|
# CHAT / LLM API ROUTES
|
|
# ============================================================================
|
|
|
|
@app.route("/api/chat/send", methods=["POST"])
|
|
@require_auth
|
|
def chat_send():
|
|
"""Send message to LLM about current document."""
|
|
try:
|
|
data = request.json
|
|
user_id = session["user_id"]
|
|
site_id = data.get("site_id")
|
|
file_path = data.get("file_path")
|
|
message = data.get("message")
|
|
|
|
if not site_id or not file_path or not message:
|
|
return jsonify({"error": "site_id, file_path, and message are required"}), 400
|
|
|
|
chat_key = f"{user_id}:{site_id}:{file_path}"
|
|
|
|
# Get or initialize conversation
|
|
if chat_key not in CHAT_CONVERSATIONS:
|
|
return jsonify({"error": "No document loaded. Please read a file first."}), 400
|
|
|
|
conversation = CHAT_CONVERSATIONS[chat_key]
|
|
document_content = conversation["content"]
|
|
messages = conversation["messages"]
|
|
|
|
# Add user message
|
|
messages.append({
|
|
"role": "user",
|
|
"content": message
|
|
})
|
|
|
|
# Get LLM response
|
|
try:
|
|
response = llm_client.chat(messages, context=document_content)
|
|
|
|
# Add assistant response
|
|
messages.append({
|
|
"role": "assistant",
|
|
"content": response
|
|
})
|
|
|
|
return jsonify({
|
|
"response": response,
|
|
"messages": messages
|
|
})
|
|
|
|
except Exception as llm_error:
|
|
# Remove user message if LLM failed
|
|
messages.pop()
|
|
return jsonify({"error": f"LLM error: {str(llm_error)}"}), 500
|
|
|
|
except Exception as e:
|
|
return jsonify({"error": str(e)}), 500
|
|
|
|
|
|
@app.route("/api/chat/history")
|
|
@require_auth
|
|
def chat_history():
|
|
"""Get chat history for current document."""
|
|
try:
|
|
user_id = session["user_id"]
|
|
site_id = request.args.get("site_id")
|
|
file_path = request.args.get("file_path")
|
|
|
|
if not site_id or not file_path:
|
|
return jsonify({"error": "site_id and file_path are required"}), 400
|
|
|
|
chat_key = f"{user_id}:{site_id}:{file_path}"
|
|
|
|
if chat_key not in CHAT_CONVERSATIONS:
|
|
return jsonify({"messages": []})
|
|
|
|
messages = CHAT_CONVERSATIONS[chat_key]["messages"]
|
|
return jsonify({"messages": messages})
|
|
|
|
except Exception as e:
|
|
return jsonify({"error": str(e)}), 500
|
|
|
|
|
|
@app.route("/api/chat/clear", methods=["POST"])
|
|
@require_auth
|
|
def chat_clear():
|
|
"""Clear chat history for current document."""
|
|
try:
|
|
data = request.json
|
|
user_id = session["user_id"]
|
|
site_id = data.get("site_id")
|
|
file_path = data.get("file_path")
|
|
|
|
if not site_id or not file_path:
|
|
return jsonify({"error": "site_id and file_path are required"}), 400
|
|
|
|
chat_key = f"{user_id}:{site_id}:{file_path}"
|
|
|
|
if chat_key in CHAT_CONVERSATIONS:
|
|
CHAT_CONVERSATIONS[chat_key]["messages"] = []
|
|
|
|
return jsonify({"success": True})
|
|
|
|
except Exception as e:
|
|
return jsonify({"error": str(e)}), 500
|
|
|
|
|
|
@app.route("/api/llm/status")
|
|
@require_auth
|
|
def llm_status():
|
|
"""Check if LLM is available."""
|
|
try:
|
|
available = llm_client.is_available()
|
|
return jsonify({
|
|
"available": available,
|
|
"provider": llm_provider
|
|
})
|
|
except Exception as e:
|
|
return jsonify({
|
|
"available": False,
|
|
"provider": llm_provider,
|
|
"error": str(e)
|
|
})
|
|
|
|
|
|
# ============================================================================
|
|
# MULTI-DOCUMENT CHAT / RAG API ROUTES
|
|
# ============================================================================
|
|
|
|
@app.route("/api/documents/add", methods=["POST"])
|
|
@require_auth
|
|
def add_document_to_vector_store():
|
|
"""Add document to vector store with tags for multi-document chat."""
|
|
if not vector_store:
|
|
return jsonify({"error": "Vector store not available"}), 503
|
|
|
|
try:
|
|
data = request.json
|
|
user_id = session["user_id"]
|
|
site_id = data.get("site_id")
|
|
file_path = data.get("file_path")
|
|
filename = data.get("filename")
|
|
content = data.get("content")
|
|
tags = data.get("tags", []) # e.g., ["HR", "SALES", "Q4-2024"]
|
|
|
|
if not all([site_id, file_path, filename, content]):
|
|
return jsonify({"error": "site_id, file_path, filename, and content are required"}), 400
|
|
|
|
# Add document to vector store
|
|
document_id = vector_store.add_document(
|
|
user_id=user_id,
|
|
site_id=site_id,
|
|
file_path=file_path,
|
|
filename=filename,
|
|
content=content,
|
|
tags=tags,
|
|
chunk_size=1000,
|
|
chunk_overlap=200
|
|
)
|
|
|
|
return jsonify({
|
|
"success": True,
|
|
"document_id": document_id,
|
|
"tags": tags
|
|
})
|
|
|
|
except Exception as e:
|
|
return jsonify({"error": str(e)}), 500
|
|
|
|
|
|
@app.route("/api/documents/tags")
|
|
@require_auth
|
|
def list_document_tags():
|
|
"""List all tags for user's documents with counts."""
|
|
if not vector_store:
|
|
return jsonify({"error": "Vector store not available"}), 503
|
|
|
|
try:
|
|
user_id = session["user_id"]
|
|
tag_counts = vector_store.list_tags(user_id)
|
|
|
|
return jsonify({
|
|
"tags": tag_counts
|
|
})
|
|
|
|
except Exception as e:
|
|
return jsonify({"error": str(e)}), 500
|
|
|
|
|
|
@app.route("/api/documents/indexed-sites")
|
|
@require_auth
|
|
def get_indexed_sites():
|
|
"""Get list of site IDs that have indexed documents."""
|
|
if not vector_store:
|
|
return jsonify({"error": "Vector store not available"}), 503
|
|
|
|
try:
|
|
user_id = session["user_id"]
|
|
site_ids = vector_store.get_indexed_sites(user_id)
|
|
|
|
return jsonify({
|
|
"site_ids": site_ids
|
|
})
|
|
|
|
except Exception as e:
|
|
return jsonify({"error": str(e)}), 500
|
|
|
|
|
|
@app.route("/api/documents/by-tags")
|
|
@require_auth
|
|
def get_documents_by_tags():
|
|
"""Get documents filtered by tags."""
|
|
if not vector_store:
|
|
return jsonify({"error": "Vector store not available"}), 503
|
|
|
|
try:
|
|
user_id = session["user_id"]
|
|
tags = request.args.get("tags", "").split(",")
|
|
tags = [t.strip() for t in tags if t.strip()]
|
|
|
|
if not tags:
|
|
return jsonify({"error": "tags parameter required"}), 400
|
|
|
|
documents = vector_store.get_documents_by_tags(user_id, tags)
|
|
|
|
return jsonify({
|
|
"documents": [
|
|
{
|
|
"document_id": doc.document_id,
|
|
"filename": doc.filename,
|
|
"file_path": doc.file_path,
|
|
"site_id": doc.site_id,
|
|
"tags": doc.tags,
|
|
"created_at": doc.created_at,
|
|
"chunk_count": doc.chunk_count
|
|
}
|
|
for doc in documents
|
|
]
|
|
})
|
|
|
|
except Exception as e:
|
|
return jsonify({"error": str(e)}), 500
|
|
|
|
|
|
@app.route("/api/documents/update-tags", methods=["POST"])
|
|
@require_auth
|
|
def update_document_tags():
|
|
"""Update tags for a document."""
|
|
if not vector_store:
|
|
return jsonify({"error": "Vector store not available"}), 503
|
|
|
|
try:
|
|
data = request.json
|
|
user_id = session["user_id"]
|
|
document_id = data.get("document_id")
|
|
tags = data.get("tags", [])
|
|
|
|
if not document_id:
|
|
return jsonify({"error": "document_id is required"}), 400
|
|
|
|
vector_store.update_document_tags(document_id, user_id, tags)
|
|
|
|
return jsonify({"success": True})
|
|
|
|
except ValueError as e:
|
|
return jsonify({"error": str(e)}), 403
|
|
except Exception as e:
|
|
return jsonify({"error": str(e)}), 500
|
|
|
|
|
|
@app.route("/api/chat/multi", methods=["POST"])
|
|
@require_auth
|
|
def chat_multi_documents():
|
|
"""Chat with multiple documents using RAG (Retrieval-Augmented Generation)."""
|
|
if not vector_store:
|
|
return jsonify({"error": "Vector store not available"}), 503
|
|
|
|
try:
|
|
data = request.json
|
|
user_id = session["user_id"]
|
|
message = data.get("message")
|
|
tags = data.get("tags", []) # Optional: filter by tags
|
|
top_k = data.get("top_k", 5) # Number of relevant chunks to retrieve
|
|
|
|
if not message:
|
|
return jsonify({"error": "message is required"}), 400
|
|
|
|
# Search for relevant document chunks
|
|
results = vector_store.search(
|
|
user_id=user_id,
|
|
query=message,
|
|
tags=tags if tags else None,
|
|
top_k=top_k
|
|
)
|
|
|
|
if not results:
|
|
return jsonify({
|
|
"error": "No relevant documents found. Please add documents to the vector store first."
|
|
}), 400
|
|
|
|
# Build context from retrieved chunks
|
|
context_parts = []
|
|
for i, result in enumerate(results, 1):
|
|
chunk = result["chunk"]
|
|
document = result["document"]
|
|
similarity = result["similarity"]
|
|
|
|
context_parts.append(
|
|
f"[Document {i}: {document.filename} (Tags: {', '.join(document.tags)}) - Similarity: {similarity:.2f}]\n"
|
|
f"{chunk.content}\n"
|
|
)
|
|
|
|
context = "\n---\n".join(context_parts)
|
|
|
|
# Get LLM response with RAG context
|
|
messages = [
|
|
{
|
|
"role": "user",
|
|
"content": message
|
|
}
|
|
]
|
|
|
|
try:
|
|
response = llm_client.chat(messages, context=context)
|
|
|
|
# Build source information
|
|
sources = [
|
|
{
|
|
"filename": result["document"].filename,
|
|
"tags": result["document"].tags,
|
|
"similarity": result["similarity"],
|
|
"chunk_index": result["chunk"].chunk_index
|
|
}
|
|
for result in results
|
|
]
|
|
|
|
return jsonify({
|
|
"response": response,
|
|
"sources": sources,
|
|
"context_chunks": len(results)
|
|
})
|
|
|
|
except Exception as llm_error:
|
|
return jsonify({"error": f"LLM error: {str(llm_error)}"}), 500
|
|
|
|
except Exception as e:
|
|
return jsonify({"error": str(e)}), 500
|
|
|
|
|
|
@app.route("/api/chat/multi/stream", methods=["POST"])
|
|
@require_auth
|
|
def chat_multi_documents_stream():
|
|
"""Chat with multiple documents using RAG with streaming response."""
|
|
if not vector_store:
|
|
return jsonify({"error": "Vector store not available"}), 503
|
|
|
|
# Capture request data and session outside of generator
|
|
data = request.json
|
|
user_id = session["user_id"]
|
|
message = data.get("message")
|
|
tags = data.get("tags", [])
|
|
top_k = data.get("top_k", 5)
|
|
|
|
def generate():
|
|
try:
|
|
|
|
if not message:
|
|
yield f"data: {json.dumps({'error': 'message is required'})}\n\n"
|
|
return
|
|
|
|
# Search for relevant document chunks
|
|
results = vector_store.search(
|
|
user_id=user_id,
|
|
query=message,
|
|
tags=tags if tags else None,
|
|
top_k=top_k
|
|
)
|
|
|
|
if not results:
|
|
yield f"data: {json.dumps({'error': 'No relevant documents found'})}\n\n"
|
|
return
|
|
|
|
# Build context from retrieved chunks
|
|
context_parts = []
|
|
for i, result in enumerate(results, 1):
|
|
chunk = result["chunk"]
|
|
document = result["document"]
|
|
similarity = result["similarity"]
|
|
|
|
context_parts.append(
|
|
f"[Document {i}: {document.filename} (Tags: {', '.join(document.tags)}) - Similarity: {similarity:.2f}]\n"
|
|
f"{chunk.content}\n"
|
|
)
|
|
|
|
context = "\n---\n".join(context_parts)
|
|
|
|
# Build source information
|
|
sources = [
|
|
{
|
|
"filename": result["document"].filename,
|
|
"tags": result["document"].tags,
|
|
"similarity": result["similarity"],
|
|
"chunk_index": result["chunk"].chunk_index
|
|
}
|
|
for result in results
|
|
]
|
|
|
|
# Send sources first
|
|
yield f"data: {json.dumps({'sources': sources})}\n\n"
|
|
|
|
# Get LLM response with RAG context - streaming
|
|
messages = [{"role": "user", "content": message}]
|
|
|
|
try:
|
|
# Check if client supports streaming
|
|
if hasattr(llm_client, 'chat_stream'):
|
|
for chunk in llm_client.chat_stream(messages, context=context):
|
|
yield f"data: {json.dumps({'chunk': chunk})}\n\n"
|
|
else:
|
|
# Fallback to non-streaming
|
|
response = llm_client.chat(messages, context=context)
|
|
yield f"data: {json.dumps({'chunk': response})}\n\n"
|
|
|
|
yield f"data: {json.dumps({'done': True})}\n\n"
|
|
|
|
except Exception as llm_error:
|
|
yield f"data: {json.dumps({'error': f'LLM error: {str(llm_error)}'})}\n\n"
|
|
|
|
except Exception as e:
|
|
yield f"data: {json.dumps({'error': str(e)})}\n\n"
|
|
|
|
return app.response_class(generate(), mimetype='text/event-stream')
|
|
|
|
|
|
# ============================================================================
|
|
# BACKGROUND INDEXING API ROUTES
|
|
# ============================================================================
|
|
|
|
@app.route("/api/indexing/start", methods=["POST"])
|
|
@require_auth
|
|
@require_config
|
|
def start_site_indexing():
|
|
"""Start background indexing for a SharePoint site."""
|
|
if not vector_store:
|
|
return jsonify({"error": "Vector store not available"}), 503
|
|
|
|
try:
|
|
data = request.json
|
|
user_id = session["user_id"]
|
|
site_id = data.get("site_id")
|
|
site_name = data.get("site_name", "Unknown Site")
|
|
connection_id = data.get("connection_id")
|
|
path = data.get("path", "")
|
|
tags = data.get("tags", []) # Tags for all documents in this site
|
|
|
|
if not site_id or not connection_id:
|
|
return jsonify({"error": "site_id and connection_id are required"}), 400
|
|
|
|
# Generate job ID
|
|
import secrets
|
|
job_id = f"idx_{user_id}_{site_id}_{secrets.token_hex(4)}"
|
|
|
|
# Get connector
|
|
conn = get_or_create_connector()
|
|
if not conn:
|
|
return jsonify({"error": "Not configured"}), 400
|
|
|
|
# Start indexing in background
|
|
indexer = get_indexer()
|
|
job = indexer.start_indexing(
|
|
job_id=job_id,
|
|
site_id=site_id,
|
|
site_name=site_name,
|
|
connection_id=connection_id,
|
|
user_id=user_id,
|
|
connector=conn,
|
|
vector_store=vector_store,
|
|
document_parser=document_parser,
|
|
path=path,
|
|
tags=tags
|
|
)
|
|
|
|
return jsonify({
|
|
"success": True,
|
|
"job_id": job_id,
|
|
"message": f"Started indexing {site_name}"
|
|
})
|
|
|
|
except Exception as e:
|
|
return jsonify({"error": str(e)}), 500
|
|
|
|
|
|
@app.route("/api/indexing/status/<job_id>")
|
|
@require_auth
|
|
def get_indexing_status(job_id):
|
|
"""Get the status of a background indexing job."""
|
|
try:
|
|
indexer = get_indexer()
|
|
status = indexer.get_job_status(job_id)
|
|
|
|
if not status:
|
|
return jsonify({"error": "Job not found"}), 404
|
|
|
|
return jsonify(status)
|
|
|
|
except Exception as e:
|
|
return jsonify({"error": str(e)}), 500
|
|
|
|
|
|
@app.route("/api/indexing/cancel/<job_id>", methods=["POST"])
|
|
@require_auth
|
|
def cancel_indexing(job_id):
|
|
"""Cancel a running indexing job."""
|
|
try:
|
|
indexer = get_indexer()
|
|
cancelled = indexer.cancel_job(job_id)
|
|
|
|
if cancelled:
|
|
return jsonify({"success": True, "message": "Job cancelled"})
|
|
else:
|
|
return jsonify({"error": "Job not found or not running"}), 404
|
|
|
|
except Exception as e:
|
|
return jsonify({"error": str(e)}), 500
|
|
|
|
|
|
# ============================================================================
|
|
# HEALTH CHECK (for ECS)
|
|
# ============================================================================
|
|
|
|
@app.route("/health")
|
|
def health():
|
|
"""Health check endpoint for ECS."""
|
|
return jsonify({"status": "healthy"}), 200
|
|
|
|
|
|
# ============================================================================
|
|
# ERROR HANDLERS
|
|
# ============================================================================
|
|
|
|
@app.errorhandler(404)
def not_found(e):
    """Render unknown routes as a JSON error body."""
    return jsonify({"error": "Not found"}), 404
|
|
|
|
|
|
@app.errorhandler(500)
def internal_error(e):
    """Render unhandled server errors as a JSON error body."""
    return jsonify({"error": "Internal server error"}), 500
|
|
|
|
|
|
# ============================================================================
|
|
# MAIN
|
|
# ============================================================================
|
|
|
|
if __name__ == "__main__":
|
|
print("✅ SharePoint Connector starting (Production Mode)...")
|
|
print("📝 Users will enter their Azure credentials through the web interface")
|
|
print(f"🌎 AWS Region: {os.getenv('AWS_REGION', 'ap-southeast-2')}")
|
|
print(f"🏷️ Table Prefix: {os.getenv('TABLE_PREFIX', 'prod_')}")
|
|
print(f"🔒 Using IAM task role for DynamoDB access")
|
|
print("\n🚀 Open http://localhost:8000 in your browser\n")
|
|
print("📋 Users will need:")
|
|
print(" - Azure App Registration Client ID")
|
|
print(" - Azure App Registration Client Secret")
|
|
print(" - Tenant ID (or use 'common' for multi-tenant)")
|
|
print()
|
|
|
|
# Run Flask app (for local testing, in ECS use gunicorn)
|
|
app.run(
|
|
debug=False, # Never True in production
|
|
host="0.0.0.0",
|
|
port=int(os.getenv("PORT", 8000))
|
|
)
|