# intesa_splunk_main/agent_runner.py

import os, sys, glob, json, ujson, gzip, pathlib, re
from typing import List, Dict, Any

from dotenv import load_dotenv
from notify import send_email
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain.tools import Tool
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# ----- load .env -----
# Runs before any of the settings below are read from the environment.
# The file defaults to ./.env; override with ENV_FILE=/path/to/.env.
load_dotenv(dotenv_path=os.environ.get("ENV_FILE", ".env"))

# ----- read env (supports both AZURE_* and AOAI_*) -----
def _norm_endpoint(ep: str | None) -> str:
    if not ep: return ""
    ep = ep.strip().rstrip("/")
    # strip any trailing /openai[/v...]
    ep = re.sub(r"/openai(?:/v\d+(?:\.\d+)?(?:-\w+)?)?$", "", ep)
    return ep + "/"

# Azure OpenAI connection settings. For each one the AZURE_OPENAI_* variable
# is consulted first, then its AOAI_* alias; an empty value counts as unset.
AZ_ENDPOINT = _norm_endpoint(
    os.environ.get("AZURE_OPENAI_ENDPOINT") or os.environ.get("AOAI_ENDPOINT")
)

# The API key additionally falls back to the generic OPENAI_API_KEY.
AZ_API_KEY = (
    os.environ.get("AZURE_OPENAI_API_KEY")
    or os.environ.get("AOAI_API_KEY")
    or os.environ.get("OPENAI_API_KEY")
)

AZ_API_VERSION = (
    os.environ.get("AZURE_OPENAI_API_VERSION")
    or os.environ.get("AOAI_API_VERSION")
    or "2025-01-01-preview"
)

AZ_CHAT_DEPLOY = (
    os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT")
    or os.environ.get("AOAI_CHAT_DEPLOYMENT")
    or "gpt-4o-mini"
)

# Empty string means "no embeddings deployment configured".
AZ_EMBED_DEPLOY = (
    os.environ.get("AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT")
    or os.environ.get("AOAI_EMBED_DEPLOYMENT")
    or ""
)

# ----- local data config -----
# CHUNK_DIR: directory searched (non-recursively) for chunk_*.jsonl* files.
# BLOB_DIR:  optional second root, searched recursively; empty disables it.
# TOP_K:     number of documents requested from a similarity search.
CHUNK_DIR = os.environ.get("CHUNK_DIR", "./out")
BLOB_DIR = os.environ.get("BLOB_DIR", "")
TOP_K = int(os.environ.get("TOP_K", "12"))

# ---------- Helpers to build LLM/Embeddings for Azure OpenAI ----------
def make_llm(temperature: float = 0.2) -> AzureChatOpenAI:
    """Create the Azure OpenAI chat client from the module-level settings.

    Args:
        temperature: Sampling temperature passed to the chat model.

    Returns:
        An ``AzureChatOpenAI`` configured with the endpoint, API key,
        API version and chat deployment read from the environment.

    Raises:
        RuntimeError: If the endpoint or the API key is not configured.
    """
    if not (AZ_ENDPOINT and AZ_API_KEY):
        raise RuntimeError("Set AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_API_KEY (or AOAI_* equivalents).")

    client_settings = {
        "azure_endpoint": AZ_ENDPOINT,
        "api_key": AZ_API_KEY,
        "api_version": AZ_API_VERSION,
        "azure_deployment": AZ_CHAT_DEPLOY,
        "temperature": temperature,
    }
    return AzureChatOpenAI(**client_settings)

def make_embeddings() -> AzureOpenAIEmbeddings | None:
    """Create the Azure OpenAI embeddings client, if one is configured.

    Returns:
        An ``AzureOpenAIEmbeddings`` bound to the embeddings deployment, or
        ``None`` when no embeddings deployment name is set.
    """
    if AZ_EMBED_DEPLOY:
        client_settings = {
            "azure_endpoint": AZ_ENDPOINT,
            "api_key": AZ_API_KEY,
            "api_version": AZ_API_VERSION,
            "azure_deployment": AZ_EMBED_DEPLOY,
        }
        return AzureOpenAIEmbeddings(**client_settings)
    return None

# ---------- Load JSONL chunk files ----------
def _iter_chunk_files() -> List[pathlib.Path]:
    """List the chunk files under CHUNK_DIR and BLOB_DIR, newest first.

    CHUNK_DIR is searched non-recursively and BLOB_DIR recursively for names
    matching ``chunk_*.jsonl*``. A root that is unset or missing is skipped.

    Returns:
        Regular files only, each listed once even when the two roots overlap,
        ordered by modification time (most recent first). Entries that vanish
        or cannot be stat'ed while listing are left out.
    """
    matches: List[str] = []
    if CHUNK_DIR and pathlib.Path(CHUNK_DIR).exists():
        # glob.escape keeps "[", "*" and "?" in the directory name literal.
        matches += glob.glob(os.path.join(glob.escape(CHUNK_DIR), "chunk_*.jsonl*"))
    if BLOB_DIR and pathlib.Path(BLOB_DIR).exists():
        matches += glob.glob(
            os.path.join(glob.escape(BLOB_DIR), "**", "chunk_*.jsonl*"),
            recursive=True,
        )

    seen = set()
    dated = []
    for raw in matches:
        # Compare real paths so a file reachable through both roots (or via
        # differently spelled paths) is only counted once.
        real = os.path.realpath(raw)
        if real in seen:
            continue
        seen.add(real)
        path = pathlib.Path(raw)
        try:
            if not path.is_file():
                continue
            mtime = path.stat().st_mtime
        except OSError:
            # The file disappeared between globbing and stat; ignore it.
            continue
        dated.append((mtime, path))

    dated.sort(key=lambda item: item[0], reverse=True)
    return [path for _, path in dated]

def _read_jsonl(path: pathlib.Path) -> List[Dict[str, Any]]:
    """Load the JSON objects stored one per line in ``path``.

    Files whose last suffix is ``.gz`` are gunzipped in memory first. Blank
    lines and lines that fail to parse are skipped, as are lines holding
    valid JSON that is not an object (``null``, numbers, strings, lists).

    Args:
        path: Chunk file to read, plain or gzip-compressed JSONL.

    Returns:
        The parsed records in file order; every element is a ``dict``.
    """
    payload = path.read_bytes()
    if path.suffix == ".gz":
        payload = gzip.decompress(payload)

    records: List[Dict[str, Any]] = []
    for line in payload.splitlines():
        if not line.strip():
            continue
        try:
            parsed = ujson.loads(line)
        except Exception:
            # Deliberately best-effort: one corrupt line must not discard
            # the rest of the chunk.
            continue
        # Callers use dict methods on every record, so anything that is not
        # a JSON object is dropped here instead of crashing them later.
        if isinstance(parsed, dict):
            records.append(parsed)
    return records

# Accept either raw events or HEC-shaped {"event": {...}}
def _normalize_event(rec: Dict[str, Any]) -> Dict[str, Any]:
    """Return the event payload carried by a record.

    Args:
        rec: Either a raw event or an envelope of the form
            ``{"event": {...}}``.

    Returns:
        The nested ``"event"`` dict when there is one, otherwise ``rec``
        itself.
    """
    inner = rec.get("event")
    # An "event" value that is not a dict (for example a plain string or
    # null) cannot be treated as an event by code that calls .get() on it,
    # so fall back to the enclosing record in that case.
    return inner if isinstance(inner, dict) else rec

def _evt_to_text(evt: Dict[str, Any]) -> str:
    """Render an event as a single ``bonifico | k=v | k=v`` text line.

    Only a fixed set of fields is emitted, always in the same order; fields
    that are absent or ``None`` are left out.

    Args:
        evt: Event dictionary.

    Returns:
        The text line; ``"bonifico | "`` when no listed field has a value.
    """
    field_order = (
        "event_type", "transaction_id", "step", "status", "importo", "divisa",
        "istantaneo", "spese_commissioni", "causale", "data_pagamento",
        "iban_origin_masked", "iban_dest_masked", "vop_check", "vop_score",
        "bic_swift", "latency_ms", "device", "os", "browser", "geo",
    )
    rendered = []
    for name in field_order:
        value = evt.get(name)
        if value is None:
            continue
        rendered.append(f"{name}={value}")
    return "bonifico | " + " | ".join(rendered)

# ---------- Build vector store (only if embeddings deployment exists) ----------
def build_vectorstore(limit_files: int = 20):
    """Embed the newest chunk files into an in-memory FAISS index.

    Args:
        limit_files: Maximum number of chunk files (newest first) to index.

    Returns:
        A ``(vectorstore, meta_index)`` tuple, where ``meta_index`` holds the
        normalized event dicts in the same order as the indexed documents.

    Raises:
        RuntimeError: If no embeddings deployment is configured, no chunk
            files are found, or the selected files contain no events.
    """
    embs = make_embeddings()
    if embs is None:
        raise RuntimeError("No embeddings deployment set. Export AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT.")

    files = _iter_chunk_files()[:limit_files]
    if not files:
        raise RuntimeError("No chunk files found; set CHUNK_DIR or BLOB_DIR.")

    docs, meta_index = [], []
    for fp in files:
        for rec in _read_jsonl(fp):
            evt = _normalize_event(rec)
            metadata = {
                "file": fp.name,
                "transaction_id": evt.get("transaction_id"),
                "step": evt.get("step"),
                "status": evt.get("status"),
            }
            docs.append(Document(page_content=_evt_to_text(evt), metadata=metadata))
            meta_index.append(evt)

    # Building a FAISS index from zero documents fails with an opaque
    # IndexError inside the library; report the actual problem instead.
    if not docs:
        raise RuntimeError("Chunk files contain no events; nothing to index.")

    vs = FAISS.from_documents(docs, embs)
    return vs, meta_index

# ---------- Tools ----------
def stats_tool_impl(query: str = "") -> str:
    """
    Filters supported in `query` (space-separated):
      status:<accepted|pending|rejected>
      step:<compila|conferma|esito>
      divisa:<EUR|USD|GBP>
      instant:<true|false>
      vop:<no_match|close_match|match>
      min_amount:<float>
      iban_country:<2-letter e.g., IT>
    Examples:
      'status:rejected min_amount:10000'
      'vop:no_match step:esito'
      'divisa:EUR instant:true'
    """
    # load recent events into memory
    files = _iter_chunk_files()[:20]
    events = []
    for fp in files:
        for rec in _read_jsonl(fp):
            events.append(_normalize_event(rec))

    # parse filters
    q = query.lower()
    def _kv(key, pat=r"([^\s]+)"):
        m = re.search(fr"{key}:{pat}", q)
        return m.group(1) if m else None

    status_f = _kv("status")
    step_f   = _kv("step")
    div_f    = _kv("divisa")
    vop_f    = _kv("vop")
    country  = _kv("iban_country")
    instant_s = _kv("instant")
    min_amt_s = _kv("min_amount")
    min_amt   = float(min_amt_s) if min_amt_s else 0.0
    inst_f = None
    if instant_s in {"true","false"}:
        inst_f = (instant_s == "true")

    def _boolish(x):
        if isinstance(x, bool): return x
        if isinstance(x, str): return x.lower() in {"true","1","yes"}
        return False

    def keep(e):
        try: amt = float(e.get("importo", 0) or 0)
        except: amt = 0.0
        if amt < min_amt: return False
        if status_f and (str(e.get("status","")).lower() != status_f): return False
        if step_f   and (str(e.get("step","")).lower()   != step_f):   return False
        if div_f    and (str(e.get("divisa","")).upper() != div_f.upper()): return False
        if vop_f:
            v = str(e.get("vop_check","")).lower()
            if v != vop_f: return False
        if inst_f is not None and _boolish(e.get("instantaneo") or e.get("istantaneo")) != inst_f:
            return False
        if country:
            # heuristic from IBAN (dest or origin)
            iban = (e.get("iban_dest_masked") or e.get("iban_origin_masked") or "").upper()
            if not iban.startswith(country.upper()):
                return False
        return True

    filtered = [e for e in events if keep(e)]

    total = len(filtered)
    rej = sum(1 for e in filtered if str(e.get("status","")).lower()=="rejected")
    amt_sum = 0.0; hi = 0.0; hi_tx = None
    for e in filtered:
        try: amt = float(e.get("importo", 0) or 0)
        except: amt = 0.0
        amt_sum += amt
        if amt > hi:
            hi, hi_tx = amt, e.get("transaction_id")
    return f"events={total}, rejected={rej}, rejection_rate={round(rej/max(total,1),3)}, total_amount={round(amt_sum,2)}, max_amount={hi} (tx={hi_tx})"

def retrieve_tool_impl(question: str) -> str:
    vs, _ = build_vectorstore()
    docs = vs.similarity_search(question, k=TOP_K)
    return "\n".join(f"[{i+1}] {d.page_content}" for i, d in enumerate(docs))

def raw_sample_tool_impl(arg: str = "") -> str:
    """
    Return a few raw JSON events from the newest chunks.
    Accepts the same filters as get_stats PLUS optional 'n:<int>' to control how many.
    Examples:
      'n:5 status:rejected min_amount:10000'
      'divisa:EUR instant:true step:esito n:3'
    """
    q = (arg or "").lower()

    # helpers (same parsing as get_stats)
    def _kv(key, pat=r"([^\s]+)"):
        m = re.search(fr"{key}:{pat}", q)
        return m.group(1) if m else None

    n_s      = _kv("n", r"(\d+)")
    n        = int(n_s) if n_s else 5
    status_f = _kv("status")
    step_f   = _kv("step")
    div_f    = _kv("divisa")
    vop_f    = _kv("vop")
    country  = _kv("iban_country")
    instant_s = _kv("instant")
    min_amt_s = _kv("min_amount")
    min_amt   = float(min_amt_s) if min_amt_s else 0.0

    inst_f = None
    if instant_s in {"true","false"}:
        inst_f = (instant_s == "true")

    def _boolish(x):
        if isinstance(x, bool): return x
        if isinstance(x, str):  return x.lower() in {"true","1","yes"}
        return False

    def keep(e):
        try: amt = float(e.get("importo", 0) or 0)
        except: amt = 0.0
        if amt < min_amt: return False
        if status_f and (str(e.get("status","")).lower() != status_f): return False
        if step_f   and (str(e.get("step","")).lower()   != step_f):   return False
        if div_f    and (str(e.get("divisa","")).upper() != div_f.upper()): return False
        if vop_f:
            v = str(e.get("vop_check","")).lower()
            if v != vop_f: return False
        if inst_f is not None and _boolish(e.get("instantaneo") or e.get("istantaneo")) != inst_f:
            return False
        if country:
            iban = (e.get("iban_dest_masked") or e.get("iban_origin_masked") or "").upper()
            if not iban.startswith(country.upper()):
                return False
        return True

    # load newest events and filter
    files = _iter_chunk_files()
    out = []
    for fp in files:
        for rec in _read_jsonl(fp):
            evt = _normalize_event(rec)
            if keep(evt):
                out.append(json.dumps(evt, ensure_ascii=False))
                if len(out) >= n:
                    break
        if len(out) >= n:
            break

    if not out:
        return "(no matching events)"
    return "\n".join(out)


# ---------- Build the agent ----------
def build_agent():
    """Assemble the tool-calling agent and wrap it in an ``AgentExecutor``.

    The ``get_stats`` and ``raw_samples`` tools are always registered;
    ``retrieve_similar`` is added only when an embeddings deployment is
    configured.

    Returns:
        A verbose ``AgentExecutor`` with parsing-error handling enabled.
    """
    llm = make_llm(temperature=0.2)

    toolbox = [
        Tool(
            name="get_stats",
            func=stats_tool_impl,
            description="Quick stats over recent events. Example: 'status:rejected min_amount:10000 step:esito'.",
        ),
        Tool(
            name="raw_samples",
            func=raw_sample_tool_impl,
            description="Return a few raw JSON events. Accepts filters like get_stats and 'n:<int>'. Example: 'n:5 status:rejected min_amount:10000'.",
        ),
    ]
    if AZ_EMBED_DEPLOY:
        semantic_search = Tool(
            name="retrieve_similar",
            func=retrieve_tool_impl,
            description="Semantic search over logs. Ask a question about bonifico logs.",
        )
        toolbox.append(semantic_search)

    system = """You are a payments log analyst. Use the tools to inspect recent Splunk-derived logs for 'bonifico' events.
- Prefer 'get_stats' for quick metrics (rejection rate, totals).
- Use 'retrieve_similar' (if available) to pull relevant examples before concluding.
- When asked for anomalies, treat as suspicious: rejected EUR >= 10,000, 'vop_no_match', invalid IBAN/BIC, unusual spikes.
Return a short, structured report with: Findings, Evidence, and Recommended actions."""

    # Message order: system instructions, prior turns, the user input, then
    # the scratchpad slot for the agent's intermediate tool calls.
    messages = [
        ("system", system),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
        MessagesPlaceholder("agent_scratchpad"),
    ]
    prompt = ChatPromptTemplate.from_messages(messages)

    agent = create_tool_calling_agent(llm, toolbox, prompt)
    return AgentExecutor(
        agent=agent,
        tools=toolbox,
        verbose=True,
        handle_parsing_errors=True,
    )

def run_default_question(question_override: str | None = None):
    """Run the agent once, print its answer and hand it to the mailer.

    Args:
        question_override: Question to ask instead of the built-in anomaly
            scan; ``None`` or an empty string selects the built-in one.
    """
    agent = build_agent()

    if question_override:
        question = question_override
    else:
        question = (
            "Scan the latest chunks. List any anomalies "
            "(rejected EUR >= 10000, vop_no_match, invalid IBAN/BIC). "
            "Give a brief summary and next steps."
        )

    response = agent.invoke({"input": question, "chat_history": []})
    result = response.get("output", "")
    print("\n=== AGENT OUTPUT ===\n", result)

    # Email the result if MAIL_ENABLED=true (handled inside notify.py).
    # A mail failure is reported on stdout and never aborts the run.
    try:
        send_email(subject="[Intesa Logs] Agent Report", body_text=result)
    except Exception as e:
        print("[notify] email failed:", e)

if __name__ == "__main__":
    # Optional CLI: all arguments are joined into one custom question;
    # with no arguments (or an empty result) the default question is used.
    cli_question = " ".join(sys.argv[1:])
    run_default_question(cli_question or None)