Email sending functionality
Environment variables are set in the .env file
This commit is contained in:
parent
ee1526cfd9
commit
2c0f9704b3
339
agent_runner.py
339
agent_runner.py
@ -1,154 +1,303 @@
|
||||
import os, glob, json, ujson, gzip, pathlib, re
|
||||
import os, sys, glob, json, ujson, gzip, pathlib, re
|
||||
from typing import List, Dict, Any
|
||||
from dataclasses import dataclass
|
||||
|
||||
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
|
||||
from dotenv import load_dotenv
|
||||
from notify import send_email
|
||||
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
|
||||
from langchain_community.vectorstores import FAISS
|
||||
from langchain_core.documents import Document
|
||||
from langchain.tools import Tool
|
||||
from langchain.agents import AgentExecutor, create_tool_calling_agent
|
||||
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
|
||||
|
||||
# Credentials are read from the environment (shell exports or the .env file),
# e.g. OPENAI_API_KEY="sk-..." — never hard-code them in this module.

# ----- load .env (defaults to ./.env; override with ENV_FILE=/path/to/.env) -----
load_dotenv(os.getenv("ENV_FILE", ".env"))

# ---------- Config ----------
MODEL = os.getenv("LLM_MODEL", "gpt-4o-mini")
EMB_MODEL = os.getenv("EMB_MODEL", "text-embedding-3-small")
CHUNK_DIR = os.getenv("CHUNK_DIR", "./out")  # poller file sink
BLOB_DIR = os.getenv("BLOB_DIR", "")  # optional local mirror of blobs
|
||||
# ----- read env (supports both AZURE_* and AOAI_*) -----
|
||||
def _norm_endpoint(ep: str | None) -> str:
|
||||
if not ep: return ""
|
||||
ep = ep.strip().rstrip("/")
|
||||
# strip any trailing /openai[/v...]
|
||||
ep = re.sub(r"/openai(?:/v\d+(?:\.\d+)?(?:-\w+)?)?$", "", ep)
|
||||
return ep + "/"
|
||||
|
||||
# Azure OpenAI settings. For each value the AZURE_OPENAI_* variable is
# consulted first, then its AOAI_* alias, then the literal fallback (if any).
AZ_ENDPOINT = _norm_endpoint(
    os.getenv("AZURE_OPENAI_ENDPOINT") or os.getenv("AOAI_ENDPOINT")
)
AZ_API_KEY = (
    os.getenv("AZURE_OPENAI_API_KEY")
    or os.getenv("AOAI_API_KEY")
    or os.getenv("OPENAI_API_KEY")
)
AZ_API_VERSION = (
    os.getenv("AZURE_OPENAI_API_VERSION")
    or os.getenv("AOAI_API_VERSION")
    or "2025-01-01-preview"
)
AZ_CHAT_DEPLOY = (
    os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT")
    or os.getenv("AOAI_CHAT_DEPLOYMENT")
    or "gpt-4o-mini"
)
# Empty string means no embeddings deployment is configured.
AZ_EMBED_DEPLOY = (
    os.getenv("AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT")
    or os.getenv("AOAI_EMBED_DEPLOYMENT")
    or ""
)

# ----- local data config -----
CHUNK_DIR = os.getenv("CHUNK_DIR", "./out")
BLOB_DIR = os.getenv("BLOB_DIR", "")
# `or` rather than a getenv default: an empty `TOP_K=` line in .env yields ""
# (not None), and int("") would abort the import.
TOP_K = int(os.getenv("TOP_K") or "12")
|
||||
|
||||
# ---------- Helpers to build LLM/Embeddings for Azure OpenAI ----------
|
||||
def make_llm(temperature: float = 0.2) -> AzureChatOpenAI:
    """Build the Azure OpenAI chat client from the module-level AZ_* settings.

    Args:
        temperature: Sampling temperature forwarded to the chat model.

    Returns:
        An ``AzureChatOpenAI`` bound to ``AZ_CHAT_DEPLOY``.

    Raises:
        RuntimeError: If the endpoint or the API key is not configured.
    """
    if not AZ_ENDPOINT or not AZ_API_KEY:
        raise RuntimeError("Set AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_API_KEY (or AOAI_* equivalents).")
    return AzureChatOpenAI(
        azure_endpoint=AZ_ENDPOINT,
        api_key=AZ_API_KEY,
        api_version=AZ_API_VERSION,
        azure_deployment=AZ_CHAT_DEPLOY,
        temperature=temperature,
    )
|
||||
|
||||
def make_embeddings() -> AzureOpenAIEmbeddings | None:
    """Build the Azure OpenAI embeddings client, if one is configured.

    Returns:
        ``None`` when ``AZ_EMBED_DEPLOY`` is empty; otherwise an
        ``AzureOpenAIEmbeddings`` bound to that deployment.

    Raises:
        RuntimeError: If an embeddings deployment is named but the endpoint
            or API key is missing (same check and message as ``make_llm``).
    """
    if not AZ_EMBED_DEPLOY:
        return None
    if not AZ_ENDPOINT or not AZ_API_KEY:
        raise RuntimeError("Set AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_API_KEY (or AOAI_* equivalents).")
    return AzureOpenAIEmbeddings(
        azure_endpoint=AZ_ENDPOINT,
        api_key=AZ_API_KEY,
        api_version=AZ_API_VERSION,
        azure_deployment=AZ_EMBED_DEPLOY,
    )
|
||||
|
||||
# ---------- Load JSONL chunk files ----------
|
||||
def _iter_chunk_files() -> List[pathlib.Path]:
    """List chunk files from CHUNK_DIR and BLOB_DIR, newest first.

    CHUNK_DIR is searched one level deep and BLOB_DIR recursively, both for
    names matching ``chunk_*.jsonl*``. A directory that is unset or does not
    exist is skipped.

    Returns:
        Unique paths sorted by modification time, most recent first.
    """
    mtimes: Dict[pathlib.Path, float] = {}

    def _collect(pattern: str, recursive: bool = False) -> None:
        for name in glob.glob(pattern, recursive=recursive):
            candidate = pathlib.Path(name)
            try:
                # Keyed by path, so a file reachable through both directories
                # is listed (and later counted) only once.
                mtimes[candidate] = candidate.stat().st_mtime
            except OSError:
                # The file vanished (e.g. was rotated) between glob and stat.
                continue

    # glob.escape keeps metacharacters in the directory name literal.
    if CHUNK_DIR and pathlib.Path(CHUNK_DIR).exists():
        _collect(f"{glob.escape(CHUNK_DIR)}/chunk_*.jsonl*")
    if BLOB_DIR and pathlib.Path(BLOB_DIR).exists():
        _collect(f"{glob.escape(BLOB_DIR)}/**/chunk_*.jsonl*", recursive=True)

    # newest first
    return sorted(mtimes, key=mtimes.__getitem__, reverse=True)
|
||||
|
||||
def _read_jsonl(path: pathlib.Path) -> List[Dict[str, Any]]:
    """Read a JSON-lines file (gzip-compressed when it ends in ``.gz``).

    Args:
        path: File to read; it is loaded into memory in one go.

    Returns:
        The JSON objects found, in file order. Blank lines, lines that fail
        to parse, and lines whose top-level value is not an object are
        skipped.
    """
    data = path.read_bytes()
    if path.suffix == ".gz":
        data = gzip.decompress(data)

    out: List[Dict[str, Any]] = []
    for ln in data.splitlines():
        if not ln.strip():
            continue
        try:
            rec = ujson.loads(ln)
        except ValueError:
            # tolerate partial/corrupt lines
            continue
        # Downstream code calls .get() on every record, so scalars and
        # arrays are dropped here instead of crashing later.
        if isinstance(rec, dict):
            out.append(rec)
    return out
|
||||
|
||||
# Accept either raw events or HEC-shaped {"event": {...}}
|
||||
def _normalize_event(rec: Dict[str, Any]) -> Dict[str, Any]:
|
||||
evt = rec.get("event", rec)
|
||||
# Ensure strings for some fields if needed
|
||||
return evt
|
||||
return rec.get("event", rec)
|
||||
|
||||
def _evt_to_text(evt: Dict[str, Any]) -> str:
|
||||
# Compact text for embedding/RAG
|
||||
parts = []
|
||||
for k in ["event_type","transaction_id","step","status","importo","divisa","istantaneo",
|
||||
"spese_commissioni","causale","data_pagamento","iban_origin_masked","iban_dest_masked",
|
||||
"vop_check","vop_score","bic_swift","latency_ms","device","os","browser","geo"]:
|
||||
v = evt.get(k)
|
||||
if v is not None:
|
||||
parts.append(f"{k}={v}")
|
||||
keys = ["event_type","transaction_id","step","status","importo","divisa","istantaneo",
|
||||
"spese_commissioni","causale","data_pagamento","iban_origin_masked","iban_dest_masked",
|
||||
"vop_check","vop_score","bic_swift","latency_ms","device","os","browser","geo"]
|
||||
parts = [f"{k}={evt[k]}" for k in keys if evt.get(k) is not None]
|
||||
return "bonifico | " + " | ".join(parts)
|
||||
|
||||
# ---------- Build vector store ----------
|
||||
# ---------- Build vector store (only if embeddings deployment exists) ----------
|
||||
def build_vectorstore(limit_files: int = 20):
    """Embed the newest chunk files into an in-memory FAISS index.

    Args:
        limit_files: Maximum number of chunk files (newest first) to index.

    Returns:
        A ``(vectorstore, meta_index)`` pair: the FAISS store built from one
        document per event, and the list of normalized events in the same
        order as the documents.

    Raises:
        RuntimeError: If no embeddings deployment is configured, no chunk
            files are found, or the files contain no events.
    """
    embs = make_embeddings()
    if embs is None:
        raise RuntimeError("No embeddings deployment set. Export AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT.")

    files = _iter_chunk_files()[:limit_files]
    if not files:
        raise RuntimeError("No chunk files found; set CHUNK_DIR or BLOB_DIR.")

    docs, meta_index = [], []
    for fp in files:
        for rec in _read_jsonl(fp):
            evt = _normalize_event(rec)
            docs.append(Document(
                page_content=_evt_to_text(evt),
                metadata={"file": fp.name, **{k: evt.get(k) for k in ("transaction_id", "step", "status")}},
            ))
            meta_index.append(evt)

    # Fail with a clear message here rather than handing FAISS an empty
    # document list.
    if not docs:
        raise RuntimeError("Chunk files contain no events; nothing to index.")

    vs = FAISS.from_documents(docs, embs)
    return vs, meta_index
|
||||
|
||||
# ---------- Handy utilities (tools) ----------
|
||||
# ---------- Tools ----------
|
||||
def stats_tool_impl(query: str = "") -> str:
|
||||
"""
|
||||
Return quick stats from latest chunks. Query supports simple filters like:
|
||||
'status:rejected min_amount:10000 step:esito'
|
||||
Filters supported in `query` (space-separated):
|
||||
status:<accepted|pending|rejected>
|
||||
step:<compila|conferma|esito>
|
||||
divisa:<EUR|USD|GBP>
|
||||
instant:<true|false>
|
||||
vop:<no_match|close_match|match>
|
||||
min_amount:<float>
|
||||
iban_country:<2-letter e.g., IT>
|
||||
Examples:
|
||||
'status:rejected min_amount:10000'
|
||||
'vop:no_match step:esito'
|
||||
'divisa:EUR instant:true'
|
||||
"""
|
||||
import math
|
||||
vs, meta_index = build_vectorstore()
|
||||
# simple filter pass
|
||||
min_amount, step, status = 0.0, None, None
|
||||
m = re.search(r"min_amount:(\d+(\.\d+)?)", query); min_amount = float(m.group(1)) if m else 0.0
|
||||
m = re.search(r"step:(\w+)", query); step = m.group(1) if m else None
|
||||
m = re.search(r"status:(\w+)", query); status = m.group(1) if m else None
|
||||
# load recent events into memory
|
||||
files = _iter_chunk_files()[:20]
|
||||
events = []
|
||||
for fp in files:
|
||||
for rec in _read_jsonl(fp):
|
||||
events.append(_normalize_event(rec))
|
||||
|
||||
total = 0; rej = 0; amt_sum = 0.0; hi = 0.0; hi_tx = None
|
||||
for evt in meta_index:
|
||||
try:
|
||||
amt = float(evt.get("importo", 0))
|
||||
except Exception:
|
||||
amt = 0.0
|
||||
if amt < min_amount: continue
|
||||
if step and evt.get("step") != step: continue
|
||||
if status and evt.get("status") != status: continue
|
||||
total += 1
|
||||
# parse filters
|
||||
q = query.lower()
|
||||
def _kv(key, pat=r"([^\s]+)"):
|
||||
m = re.search(fr"{key}:{pat}", q)
|
||||
return m.group(1) if m else None
|
||||
|
||||
status_f = _kv("status")
|
||||
step_f = _kv("step")
|
||||
div_f = _kv("divisa")
|
||||
vop_f = _kv("vop")
|
||||
country = _kv("iban_country")
|
||||
instant_s = _kv("instant")
|
||||
min_amt_s = _kv("min_amount")
|
||||
min_amt = float(min_amt_s) if min_amt_s else 0.0
|
||||
inst_f = None
|
||||
if instant_s in {"true","false"}:
|
||||
inst_f = (instant_s == "true")
|
||||
|
||||
def _boolish(x):
|
||||
if isinstance(x, bool): return x
|
||||
if isinstance(x, str): return x.lower() in {"true","1","yes"}
|
||||
return False
|
||||
|
||||
def keep(e):
|
||||
try: amt = float(e.get("importo", 0) or 0)
|
||||
except: amt = 0.0
|
||||
if amt < min_amt: return False
|
||||
if status_f and (str(e.get("status","")).lower() != status_f): return False
|
||||
if step_f and (str(e.get("step","")).lower() != step_f): return False
|
||||
if div_f and (str(e.get("divisa","")).upper() != div_f.upper()): return False
|
||||
if vop_f:
|
||||
v = str(e.get("vop_check","")).lower()
|
||||
if v != vop_f: return False
|
||||
if inst_f is not None and _boolish(e.get("instantaneo") or e.get("istantaneo")) != inst_f:
|
||||
return False
|
||||
if country:
|
||||
# heuristic from IBAN (dest or origin)
|
||||
iban = (e.get("iban_dest_masked") or e.get("iban_origin_masked") or "").upper()
|
||||
if not iban.startswith(country.upper()):
|
||||
return False
|
||||
return True
|
||||
|
||||
filtered = [e for e in events if keep(e)]
|
||||
|
||||
total = len(filtered)
|
||||
rej = sum(1 for e in filtered if str(e.get("status","")).lower()=="rejected")
|
||||
amt_sum = 0.0; hi = 0.0; hi_tx = None
|
||||
for e in filtered:
|
||||
try: amt = float(e.get("importo", 0) or 0)
|
||||
except: amt = 0.0
|
||||
amt_sum += amt
|
||||
if evt.get("status") == "rejected": rej += 1
|
||||
if amt > hi: hi, hi_tx = amt, evt.get("transaction_id")
|
||||
rr = f"events={total}, rejected={rej}, rejection_rate={round(rej/max(total,1),3)}, total_amount={round(amt_sum,2)}, max_amount={hi} (tx={hi_tx})"
|
||||
return rr
|
||||
if amt > hi:
|
||||
hi, hi_tx = amt, e.get("transaction_id")
|
||||
return f"events={total}, rejected={rej}, rejection_rate={round(rej/max(total,1),3)}, total_amount={round(amt_sum,2)}, max_amount={hi} (tx={hi_tx})"
|
||||
|
||||
def retrieve_tool_impl(question: str) -> str:
|
||||
"""Semantic retrieve top-K log snippets related to the question."""
|
||||
vs, _ = build_vectorstore()
|
||||
docs = vs.similarity_search(question, k=TOP_K)
|
||||
lines = [f"[{i+1}] {d.page_content}" for i,d in enumerate(docs)]
|
||||
return "\n".join(lines)
|
||||
return "\n".join(f"[{i+1}] {d.page_content}" for i, d in enumerate(docs))
|
||||
|
||||
def raw_sample_tool_impl(n: int = 5) -> str:
|
||||
"""Return n raw events (JSON) from the newest chunks."""
|
||||
def raw_sample_tool_impl(arg: str = "") -> str:
|
||||
"""
|
||||
Return a few raw JSON events from the newest chunks.
|
||||
Accepts the same filters as get_stats PLUS optional 'n:<int>' to control how many.
|
||||
Examples:
|
||||
'n:5 status:rejected min_amount:10000'
|
||||
'divisa:EUR instant:true step:esito n:3'
|
||||
"""
|
||||
q = (arg or "").lower()
|
||||
|
||||
# helpers (same parsing as get_stats)
|
||||
def _kv(key, pat=r"([^\s]+)"):
|
||||
m = re.search(fr"{key}:{pat}", q)
|
||||
return m.group(1) if m else None
|
||||
|
||||
n_s = _kv("n", r"(\d+)")
|
||||
n = int(n_s) if n_s else 5
|
||||
status_f = _kv("status")
|
||||
step_f = _kv("step")
|
||||
div_f = _kv("divisa")
|
||||
vop_f = _kv("vop")
|
||||
country = _kv("iban_country")
|
||||
instant_s = _kv("instant")
|
||||
min_amt_s = _kv("min_amount")
|
||||
min_amt = float(min_amt_s) if min_amt_s else 0.0
|
||||
|
||||
inst_f = None
|
||||
if instant_s in {"true","false"}:
|
||||
inst_f = (instant_s == "true")
|
||||
|
||||
def _boolish(x):
|
||||
if isinstance(x, bool): return x
|
||||
if isinstance(x, str): return x.lower() in {"true","1","yes"}
|
||||
return False
|
||||
|
||||
def keep(e):
|
||||
try: amt = float(e.get("importo", 0) or 0)
|
||||
except: amt = 0.0
|
||||
if amt < min_amt: return False
|
||||
if status_f and (str(e.get("status","")).lower() != status_f): return False
|
||||
if step_f and (str(e.get("step","")).lower() != step_f): return False
|
||||
if div_f and (str(e.get("divisa","")).upper() != div_f.upper()): return False
|
||||
if vop_f:
|
||||
v = str(e.get("vop_check","")).lower()
|
||||
if v != vop_f: return False
|
||||
if inst_f is not None and _boolish(e.get("instantaneo") or e.get("istantaneo")) != inst_f:
|
||||
return False
|
||||
if country:
|
||||
iban = (e.get("iban_dest_masked") or e.get("iban_origin_masked") or "").upper()
|
||||
if not iban.startswith(country.upper()):
|
||||
return False
|
||||
return True
|
||||
|
||||
# load newest events and filter
|
||||
files = _iter_chunk_files()
|
||||
out = []
|
||||
for fp in files:
|
||||
for rec in _read_jsonl(fp):
|
||||
out.append(json.dumps(_normalize_event(rec), ensure_ascii=False))
|
||||
if len(out) >= n: break
|
||||
if len(out) >= n: break
|
||||
evt = _normalize_event(rec)
|
||||
if keep(evt):
|
||||
out.append(json.dumps(evt, ensure_ascii=False))
|
||||
if len(out) >= n:
|
||||
break
|
||||
if len(out) >= n:
|
||||
break
|
||||
|
||||
if not out:
|
||||
return "(no matching events)"
|
||||
return "\n".join(out)
|
||||
|
||||
|
||||
# ---------- Build the agent ----------
|
||||
def build_agent():
|
||||
llm = ChatOpenAI(model=MODEL, temperature=0.2)
|
||||
|
||||
llm = make_llm(temperature=0.2)
|
||||
tools = [
|
||||
Tool(name="get_stats",
|
||||
func=stats_tool_impl,
|
||||
description="Quick stats over recent events. Usage: pass a filter string like 'status:rejected min_amount:10000 step:esito'."),
|
||||
Tool(name="retrieve_similar",
|
||||
func=retrieve_tool_impl,
|
||||
description="Semantic search over logs. Pass a natural-language question about bonifico logs."),
|
||||
Tool(name="raw_samples",
|
||||
func=raw_sample_tool_impl,
|
||||
description="Return a few raw JSON events to inspect fields.")
|
||||
Tool(name="get_stats", func=stats_tool_impl,
|
||||
description="Quick stats over recent events. Example: 'status:rejected min_amount:10000 step:esito'."),
|
||||
Tool(name="raw_samples", func=raw_sample_tool_impl,
|
||||
description="Return a few raw JSON events. Accepts filters like get_stats and 'n:<int>'. Example: 'n:5 status:rejected min_amount:10000'.")
|
||||
]
|
||||
if AZ_EMBED_DEPLOY:
|
||||
tools.append(Tool(name="retrieve_similar", func=retrieve_tool_impl,
|
||||
description="Semantic search over logs. Ask a question about bonifico logs."))
|
||||
|
||||
system = """You are a payments log analyst. Use the tools to inspect recent Splunk-derived logs for 'bonifico' events.
|
||||
- Prefer 'get_stats' for quick metrics (rejection rate, totals).
|
||||
- Use 'retrieve_similar' to pull relevant examples before concluding.
|
||||
- When asked for anomalies, treat as suspicious: rejected EUR transfers >= 10,000, 'vop_no_match', invalid IBAN/BIC, unusual spikes.
|
||||
Return a short, structured report with: Findings, Evidence (IDs/fields), and Recommended actions."""
|
||||
- Use 'retrieve_similar' (if available) to pull relevant examples before concluding.
|
||||
- When asked for anomalies, treat as suspicious: rejected EUR >= 10,000, 'vop_no_match', invalid IBAN/BIC, unusual spikes.
|
||||
Return a short, structured report with: Findings, Evidence, and Recommended actions."""
|
||||
|
||||
prompt = ChatPromptTemplate.from_messages([
|
||||
("system", system),
|
||||
@ -156,19 +305,27 @@ Return a short, structured report with: Findings, Evidence (IDs/fields), and Rec
|
||||
("human", "{input}"),
|
||||
MessagesPlaceholder("agent_scratchpad"),
|
||||
])
|
||||
|
||||
agent = create_tool_calling_agent(llm, tools, prompt)
|
||||
executor = AgentExecutor(agent=agent, tools=tools, verbose=True, handle_parsing_errors=True)
|
||||
return executor
|
||||
return AgentExecutor(agent=agent, tools=tools, verbose=True, handle_parsing_errors=True)
|
||||
|
||||
def run_default_question(question_override: str | None = None) -> str:
    """Run the agent on one question, print the report and try to email it.

    Args:
        question_override: Question to ask; when empty or ``None`` the
            built-in anomaly-scan question is used.

    Returns:
        The agent's ``"output"`` text (``""`` if the agent returned none).
    """
    agent = build_agent()
    question = question_override or (
        "Scan the latest chunks. List any anomalies "
        "(rejected EUR >= 10000, vop_no_match, invalid IBAN/BIC). "
        "Give a brief summary and next steps."
    )
    out = agent.invoke({"input": question, "chat_history": []})
    result = out.get("output", "")
    print("\n=== AGENT OUTPUT ===\n", result)

    # Email the result if MAIL_ENABLED=true (handled inside notify.py)
    try:
        send_email(subject="[Intesa Logs] Agent Report", body_text=result)
    except Exception as e:
        # Best effort: a mail failure must not discard the printed report.
        print("[notify] email failed:", e)
    return result
|
||||
|
||||
if __name__ == "__main__":
    # optional CLI: allow a custom question (all arguments joined by spaces);
    # no arguments, or only whitespace, falls back to the default question.
    custom = " ".join(sys.argv[1:]).strip()
    run_default_question(custom or None)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user