307 lines
12 KiB
Python
307 lines
12 KiB
Python
import os, sys, glob, json, ujson, gzip, pathlib, re
|
|
from typing import List, Dict, Any
|
|
|
|
from dotenv import load_dotenv
|
|
from notify import send_email
|
|
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
|
|
from langchain_community.vectorstores import FAISS
|
|
from langchain_core.documents import Document
|
|
from langchain.tools import Tool
|
|
from langchain.agents import AgentExecutor, create_tool_calling_agent
|
|
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
|
|
|
|
# ----- load .env -----
# The dotenv file location comes from ENV_FILE and falls back to ".env".
load_dotenv(dotenv_path=os.environ.get("ENV_FILE", ".env"))
|
|
|
|
# ----- normalize endpoint -----
|
|
def _norm_endpoint(ep: str | None) -> str:
|
|
if not ep: return ""
|
|
ep = ep.strip().rstrip("/")
|
|
ep = re.sub(r"/openai(?:/v\d+(?:\.\d+)?(?:-\w+)?)?$", "", ep)
|
|
return ep + "/"
|
|
|
|
# Azure OpenAI settings. Each value is looked up under its AZURE_OPENAI_* name
# first and then under the shorter AOAI_* alias; the first non-empty one wins.
AZ_ENDPOINT = _norm_endpoint(
    os.environ.get("AZURE_OPENAI_ENDPOINT") or os.environ.get("AOAI_ENDPOINT")
)
AZ_API_KEY = (
    os.environ.get("AZURE_OPENAI_API_KEY")
    or os.environ.get("AOAI_API_KEY")
    or os.environ.get("OPENAI_API_KEY")
)
AZ_API_VERSION = (
    os.environ.get("AZURE_OPENAI_API_VERSION")
    or os.environ.get("AOAI_API_VERSION")
    or "2025-01-01-preview"
)
AZ_CHAT_DEPLOY = (
    os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT")
    or os.environ.get("AOAI_CHAT_DEPLOYMENT")
    or "gpt-4o-mini"
)
# An empty embeddings deployment means "no embeddings configured".
AZ_EMBED_DEPLOY = (
    os.environ.get("AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT")
    or os.environ.get("AOAI_EMBED_DEPLOYMENT")
    or ""
)
|
|
|
|
# ----- local data config -----
# Directory scanned (non-recursively) for chunk files.
CHUNK_DIR = os.environ.get("CHUNK_DIR", "./out")
# Optional second directory, scanned recursively; empty disables it.
BLOB_DIR = os.environ.get("BLOB_DIR", "")
# Number of documents requested from the similarity search.
TOP_K = int(os.environ.get("TOP_K", "12"))
|
|
|
|
# ---------- LLM and embeddings ----------
|
|
def make_llm(temperature: float = 0.2) -> AzureChatOpenAI:
    """Create the Azure chat model client from the module-level settings.

    Args:
        temperature: Sampling temperature passed to the client.

    Raises:
        RuntimeError: If the endpoint or the API key is not configured.
    """
    have_credentials = bool(AZ_ENDPOINT) and bool(AZ_API_KEY)
    if not have_credentials:
        raise RuntimeError("Set AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_API_KEY (or AOAI_* equivalents).")

    client_kwargs = {
        "azure_endpoint": AZ_ENDPOINT,
        "api_key": AZ_API_KEY,
        "api_version": AZ_API_VERSION,
        "azure_deployment": AZ_CHAT_DEPLOY,
        "temperature": temperature,
    }
    return AzureChatOpenAI(**client_kwargs)
|
|
|
|
def make_embeddings() -> AzureOpenAIEmbeddings | None:
    """Create the Azure embeddings client, or return None when none is configured.

    Returns:
        An ``AzureOpenAIEmbeddings`` built from the module-level settings, or
        ``None`` if no embeddings deployment name is set.
    """
    if AZ_EMBED_DEPLOY:
        client_kwargs = {
            "azure_endpoint": AZ_ENDPOINT,
            "api_key": AZ_API_KEY,
            "api_version": AZ_API_VERSION,
            "azure_deployment": AZ_EMBED_DEPLOY,
        }
        return AzureOpenAIEmbeddings(**client_kwargs)
    return None
|
|
|
|
# ---------- Load JSONL chunk files ----------
|
|
def _iter_chunk_files() -> List[pathlib.Path]:
    """List chunk files from CHUNK_DIR and BLOB_DIR, newest first.

    CHUNK_DIR is scanned non-recursively and BLOB_DIR recursively, both for
    ``chunk_*.jsonl*`` and ``hec_*.jsonl*``. A directory that is unset or
    does not exist is skipped.

    Returns:
        Unique paths sorted by modification time, most recent first.
    """
    # resolved path -> (mtime, path as first discovered)
    found: Dict[pathlib.Path, tuple] = {}
    for root, recursive in ((CHUNK_DIR, False), (BLOB_DIR, True)):
        if not root or not pathlib.Path(root).exists():
            continue
        prefix = f"{root}/**" if recursive else root
        for stem in ("chunk", "hec"):
            for hit in glob.glob(f"{prefix}/{stem}_*.jsonl*", recursive=recursive):
                candidate = pathlib.Path(hit)
                try:
                    key = candidate.resolve()
                    mtime = candidate.stat().st_mtime
                except OSError:
                    # The file vanished (or became unreadable) after globbing.
                    continue
                # The same file can be reached via both directories (e.g. when
                # BLOB_DIR contains CHUNK_DIR); keep it once so its events are
                # not counted twice downstream.
                found.setdefault(key, (mtime, candidate))
    newest_first = sorted(found.values(), key=lambda item: item[0], reverse=True)
    return [path for _, path in newest_first]
|
|
|
|
def _read_jsonl(path: pathlib.Path) -> List[Dict[str, Any]]:
    """Read a JSON-lines file (optionally gzip-compressed) into a list of dicts.

    Blank lines, lines that fail to parse, and lines whose JSON value is not
    an object are skipped.

    Args:
        path: File to read; a ``.gz`` suffix triggers gzip decompression.

    Returns:
        The parsed JSON objects in file order.
    """
    import zlib  # local import: only needed to name the decompression error

    data = path.read_bytes()
    if path.suffix == ".gz":
        try:
            data = gzip.decompress(data)
        except (OSError, EOFError, zlib.error):
            # Best-effort: if the payload is not valid gzip, fall through and
            # try to parse the raw bytes as plain JSONL.
            pass

    records: List[Dict[str, Any]] = []
    for line in data.splitlines():
        if not line.strip():
            continue
        try:
            parsed = ujson.loads(line)
        except ValueError:
            # Malformed line; skip it and keep going.
            continue
        # Callers use dict methods on every record, so drop bare scalars and
        # arrays rather than let them crash later.
        if isinstance(parsed, dict):
            records.append(parsed)
    return records
|
|
|
|
def _normalize_event(rec: Dict[str, Any]) -> Dict[str, Any]:
|
|
return rec.get("event", rec)
|
|
|
|
def _evt_to_text(evt: Dict[str, Any]) -> str:
|
|
keys = ["event_type","transaction_id","step","status","importo","divisa","istantaneo",
|
|
"spese_commissioni","causale","data_pagamento","iban_origin_masked","iban_dest_masked",
|
|
"vop_check","vop_score","bic_swift","latency_ms","device","os","browser","geo"]
|
|
parts = [f"{k}={evt[k]}" for k in keys if evt.get(k) is not None]
|
|
return "bonifico | " + " | ".join(parts)
|
|
|
|
# ---------- Vector store ----------
|
|
def build_vectorstore(limit_files: int = 20):
    """Embed the events of the newest chunk files into a FAISS vector store.

    Args:
        limit_files: Maximum number of chunk files (newest first) to index.

    Returns:
        A ``(vectorstore, meta_index)`` tuple, where ``meta_index`` holds the
        event dicts in the same order as the indexed documents.

    Raises:
        RuntimeError: If no embeddings deployment is configured, no chunk
            files are found, or the files contain no usable events.
    """
    embs = make_embeddings()
    if embs is None:
        raise RuntimeError("No embeddings deployment set. Export AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT.")

    files = _iter_chunk_files()[:limit_files]
    if not files:
        raise RuntimeError("No chunk files found; set CHUNK_DIR or BLOB_DIR.")

    docs, meta_index = [], []
    for fp in files:
        for rec in _read_jsonl(fp):
            evt = _normalize_event(rec)
            if not isinstance(evt, dict):
                # Not an event object; it cannot be rendered or indexed.
                continue
            docs.append(Document(
                page_content=_evt_to_text(evt),
                metadata={"file": fp.name, **{k: evt.get(k) for k in ("transaction_id", "step", "status")}},
            ))
            meta_index.append(evt)

    if not docs:
        # FAISS cannot build an index from zero documents; fail with a clear
        # message instead of an opaque error from inside the library.
        raise RuntimeError("Chunk files contained no parseable events; nothing to index.")

    vs = FAISS.from_documents(docs, embs)
    return vs, meta_index
|
|
|
|
# ---------- Tools ----------
|
|
def stats_tool_impl(query: str = "") -> str:
    """
    Summarize the events found in the 20 newest chunk files.

    Filters supported in `query` (space-separated):
      status:<accepted|pending|rejected>
      step:<compila|conferma|esito>
      divisa:<EUR|USD|GBP>
      instant:<true|false>
      vop:<no_match|close_match|match>
      min_amount:<float>
      iban_country:<2-letter e.g., IT>
    Examples:
      'status:rejected min_amount:10000'
      'vop:no_match step:esito'
      'divisa:EUR instant:true'

    Returns:
      A one-line summary: number of matching events, number rejected,
      rejection rate, summed amount, and the largest amount with its
      transaction id. If `min_amount` is not a number, a short error message
      is returned instead.
    """
    q = (query or "").lower()

    def _kv(key, pat=r"([^\s]+)"):
        # The key must start a token, so 'substatus:x' is not read as 'status:x'.
        m = re.search(fr"(?:^|\s){key}:{pat}", q)
        return m.group(1) if m else None

    status_f = _kv("status")
    step_f = _kv("step")
    div_f = _kv("divisa")
    vop_f = _kv("vop")
    country = _kv("iban_country")
    instant_s = _kv("instant")
    min_amt_s = _kv("min_amount")

    try:
        min_amt = float(min_amt_s) if min_amt_s else 0.0
    except ValueError:
        # Report the bad filter to the caller instead of raising out of the tool.
        return f"invalid min_amount filter: {min_amt_s!r} (expected a number)"

    # None means "no instant filter"; any value other than true/false is ignored.
    inst_f = (instant_s == "true") if instant_s in {"true", "false"} else None

    def _boolish(x):
        if isinstance(x, bool):
            return x
        if isinstance(x, (int, float)):
            return x != 0
        if isinstance(x, str):
            return x.lower() in {"true", "1", "yes"}
        return False

    def _amount(e):
        # Missing or unparseable amounts count as 0.
        try:
            return float(e.get("importo", 0) or 0)
        except (TypeError, ValueError):
            return 0.0

    def keep(e):
        if _amount(e) < min_amt:
            return False
        if status_f and str(e.get("status", "")).lower() != status_f:
            return False
        if step_f and str(e.get("step", "")).lower() != step_f:
            return False
        if div_f and str(e.get("divisa", "")).upper() != div_f.upper():
            return False
        if vop_f and str(e.get("vop_check", "")).lower() != vop_f:
            return False
        # Both spellings of the flag are checked.
        if inst_f is not None and _boolish(e.get("instantaneo") or e.get("istantaneo")) != inst_f:
            return False
        if country:
            # Destination IBAN is preferred; origin is the fallback.
            iban = str(e.get("iban_dest_masked") or e.get("iban_origin_masked") or "").upper()
            if not iban.startswith(country.upper()):
                return False
        return True

    # Load recent events into memory (newest 20 files).
    events = []
    for fp in _iter_chunk_files()[:20]:
        for rec in _read_jsonl(fp):
            evt = _normalize_event(rec)
            if isinstance(evt, dict):
                events.append(evt)

    filtered = [e for e in events if keep(e)]
    total = len(filtered)
    rej = sum(1 for e in filtered if str(e.get("status", "")).lower() == "rejected")

    amt_sum, hi, hi_tx = 0.0, 0.0, None
    for e in filtered:
        amt = _amount(e)
        amt_sum += amt
        if amt > hi:
            hi, hi_tx = amt, e.get("transaction_id")

    return f"events={total}, rejected={rej}, rejection_rate={round(rej/max(total,1),3)}, total_amount={round(amt_sum,2)}, max_amount={hi} (tx={hi_tx})"
|
|
|
|
def retrieve_tool_impl(question: str) -> str:
    """Return the TOP_K indexed events most similar to `question`.

    Args:
        question: Free-text query used for the similarity search.

    Returns:
        The matching documents' text, one per line, prefixed ``[1]``, ``[2]``, ...
    """
    # NOTE(review): the vector store is rebuilt (and every event re-embedded)
    # on each call; consider caching it if this tool is called frequently.
    vs, _ = build_vectorstore()
    docs = vs.similarity_search(question, k=TOP_K)
    return "\n".join(f"[{i+1}] {d.page_content}" for i, d in enumerate(docs))
|
|
|
|
def raw_sample_tool_impl(arg: str = "") -> str:
    """
    Return a few raw JSON events from the newest chunks.

    Accepts the same filters as get_stats PLUS optional 'n:<int>':
      status:<...> step:<...> divisa:<...> instant:<true|false>
      vop:<...> min_amount:<float> iban_country:<2-letter>
      n:<int>  number of events to return (default 5, minimum 1)

    Returns:
      Up to `n` matching events, one JSON document per line, taken from the
      chunk files newest-first; "(no matching events)" when nothing matches.
      If `min_amount` is not a number, a short error message is returned
      instead.
    """
    q = (arg or "").lower()

    def _kv(key, pat=r"([^\s]+)"):
        # The key must start a token, so the short key 'n' cannot match
        # inside another word.
        m = re.search(fr"(?:^|\s){key}:{pat}", q)
        return m.group(1) if m else None

    n_s = _kv("n", r"(\d+)")
    n = max(int(n_s), 1) if n_s else 5
    status_f = _kv("status")
    step_f = _kv("step")
    div_f = _kv("divisa")
    vop_f = _kv("vop")
    country = _kv("iban_country")
    instant_s = _kv("instant")
    min_amt_s = _kv("min_amount")

    try:
        min_amt = float(min_amt_s) if min_amt_s else 0.0
    except ValueError:
        # Report the bad filter to the caller instead of raising out of the tool.
        return f"invalid min_amount filter: {min_amt_s!r} (expected a number)"

    # None means "no instant filter"; any value other than true/false is ignored.
    inst_f = (instant_s == "true") if instant_s in {"true", "false"} else None

    def _boolish(x):
        if isinstance(x, bool):
            return x
        if isinstance(x, (int, float)):
            return x != 0
        if isinstance(x, str):
            return x.lower() in {"true", "1", "yes"}
        return False

    def keep(e):
        # Missing or unparseable amounts count as 0.
        try:
            amt = float(e.get("importo", 0) or 0)
        except (TypeError, ValueError):
            amt = 0.0
        if amt < min_amt:
            return False
        if status_f and str(e.get("status", "")).lower() != status_f:
            return False
        if step_f and str(e.get("step", "")).lower() != step_f:
            return False
        if div_f and str(e.get("divisa", "")).upper() != div_f.upper():
            return False
        if vop_f and str(e.get("vop_check", "")).lower() != vop_f:
            return False
        # Both spellings of the flag are checked.
        if inst_f is not None and _boolish(e.get("instantaneo") or e.get("istantaneo")) != inst_f:
            return False
        if country:
            # Destination IBAN is preferred; origin is the fallback.
            iban = str(e.get("iban_dest_masked") or e.get("iban_origin_masked") or "").upper()
            if not iban.startswith(country.upper()):
                return False
        return True

    out = []
    for fp in _iter_chunk_files():
        for rec in _read_jsonl(fp):
            evt = _normalize_event(rec)
            if isinstance(evt, dict) and keep(evt):
                out.append(json.dumps(evt, ensure_ascii=False))
                if len(out) >= n:
                    # Enough samples collected; stop reading further files.
                    return "\n".join(out)

    if not out:
        return "(no matching events)"
    return "\n".join(out)
|
|
|
|
# ---------- Build the agent ----------
|
|
def build_agent():
    """Assemble the tool-calling agent and wrap it in an AgentExecutor.

    The agent always gets the `get_stats` and `raw_samples` tools;
    `retrieve_similar` is added only when an embeddings deployment is set.
    """
    chat_model = make_llm(temperature=0.2)

    stats_tool = Tool(
        name="get_stats",
        func=stats_tool_impl,
        description="Quick stats over recent events. Example: 'status:rejected min_amount:10000 step:esito'.",
    )
    samples_tool = Tool(
        name="raw_samples",
        func=raw_sample_tool_impl,
        description="Return a few raw JSON events. Accepts filters like get_stats and 'n:<int>'.",
    )
    toolbox = [stats_tool, samples_tool]
    if AZ_EMBED_DEPLOY:
        # Semantic search needs embeddings, so it is offered only when configured.
        toolbox.append(
            Tool(
                name="retrieve_similar",
                func=retrieve_tool_impl,
                description="Semantic search over logs. Ask a question about bonifico logs.",
            )
        )

    system = """You are a payments log analyst. Use the tools to inspect recent Splunk-derived logs for 'bonifico' events.
- Prefer 'get_stats' for quick metrics (rejection rate, totals).
- Use 'retrieve_similar' (if available) to pull relevant examples before concluding.
- When asked for anomalies, treat as suspicious: rejected EUR >= 10,000, 'vop_no_match', invalid IBAN/BIC, unusual spikes.
Return a short, structured report with: Findings, Evidence, and Recommended actions."""

    messages = [
        ("system", system),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
        MessagesPlaceholder("agent_scratchpad"),
    ]
    prompt = ChatPromptTemplate.from_messages(messages)

    agent = create_tool_calling_agent(chat_model, toolbox, prompt)
    return AgentExecutor(agent=agent, tools=toolbox, verbose=True, handle_parsing_errors=True)
|
|
|
|
def run_default_question(question_override: str | None = None):
    """Run the agent on one question, print the answer, and email it.

    Args:
        question_override: Question to ask; when empty or None, a default
            anomaly-scan question is used.

    The email step is best-effort: a failure is printed and not re-raised.
    """
    agent = build_agent()
    question = question_override or (
        "Scan the latest chunks. List any anomalies "
        "(rejected EUR >= 10000, vop_no_match, invalid IBAN/BIC). "
        "Give a brief summary and next steps."
    )
    out = agent.invoke({"input": question, "chat_history": []})
    result = out.get("output", "")
    # Real newlines around the banner (the original printed a literal "\n").
    print("\n=== AGENT OUTPUT ===\n", result)
    try:
        send_email(subject="[Intesa Logs] Agent Report", body_text=result)
    except Exception as e:
        # Notification must never mask the report that was already printed.
        print("[notify] email failed:", e)
|
|
|
|
if __name__ == "__main__":
    # All CLI arguments are joined into one question; with no arguments the
    # join is empty and the default question is used.
    cli_question = " ".join(sys.argv[1:])
    run_default_question(cli_question or None)
|