import azure.functions as func
import logging
import os
import json
import requests
from typing import List, Any
app = func.FunctionApp()
# Simple health check
@app.route(route="ping", auth_level=func.AuthLevel.ANONYMOUS)
def Ping(req: func.HttpRequest) -> func.HttpResponse:
    """Liveness probe: always answers HTTP 200 "pong", no auth required."""
    return func.HttpResponse(body="pong", status_code=200)
@app.blob_trigger(
    arg_name="blob",
    path="bank-logs/intesa/{name}.json",  # container/path pattern
    connection="BLOB_CONN"  # App Setting name (NOT the raw connection string)
)
def BlobIngest(blob: func.InputStream):
    """
    Reads JSON, NDJSON, or JSON array files from Blob Storage.

    Handles UTF-8 BOM safely. Posts parsed items in batches to
    AGENT_API_URL/ingest. Parse and HTTP errors propagate so the
    Functions runtime can retry / poison-queue the blob.
    """
    logging.info("BlobIngest fired: name=%s length=%s", blob.name, blob.length)

    # Decode once at the I/O boundary; utf-8-sig strips a BOM if present.
    # strict = raise if decoding is invalid; change to "ignore" only if you truly want to skip bad bytes
    raw: bytes = blob.read()
    text: str = raw.decode("utf-8-sig", errors="strict")

    items = _parse_items(text, blob.name)
    logging.info("Parsed %d item(s) from blob '%s'.", len(items), blob.name)

    # If no agent URL configured, just log and exit gracefully.
    api = os.environ.get("AGENT_API_URL")
    if not api:
        logging.warning("AGENT_API_URL not set; skipping POST. (Parsed %d items).", len(items))
        return

    sent = _post_batches(api, items)
    logging.info("Completed BlobIngest for %s: posted %d item(s).", blob.name, sent)


def _parse_items(text: str, name: str) -> List[Any]:
    """Parse *text* as whole-document JSON (object or array) or NDJSON.

    Returns a list of items: a top-level array yields its elements, a
    top-level object yields a one-element list. Anything else falls back
    to NDJSON mode. *name* is used only for log messages.
    """
    # Extra safety if a BOM sneaks through the utf-8-sig decode.
    stripped = text.lstrip("\ufeff").strip()
    if not stripped:
        logging.warning("Blob %s is empty.", name)
        return []

    if stripped[0] in "[{":
        try:
            parsed = json.loads(stripped)
            return parsed if isinstance(parsed, list) else [parsed]
        except json.JSONDecodeError:
            # Whole-document parse failed despite a [ or { start; fall back
            # to line-by-line NDJSON (per-line errors will raise there).
            pass

    return _parse_ndjson(text, name)


def _parse_ndjson(text: str, name: str) -> List[Any]:
    """Parse NDJSON (one JSON object per non-blank line).

    Raises json.JSONDecodeError on the first invalid line so the runtime
    can retry/poison truly bad content.
    """
    items: List[Any] = []
    for lineno, line in enumerate(text.splitlines(), start=1):
        s = line.lstrip("\ufeff").strip()  # strip BOM at line-start + trim whitespace
        if not s:
            continue
        try:
            items.append(json.loads(s))
        except json.JSONDecodeError as e:
            logging.error("Invalid JSON at line %d in %s: %s | line=%r",
                          lineno, name, e, s[:200])
            # Re-raise to allow retry/poison for truly bad content.
            raise
    return items


def _post_batches(api: str, items: List[Any]) -> int:
    """POST *items* to api/ingest in batches; return the count sent.

    Batch size and timeout come from INGEST_BATCH_SIZE (default 500) and
    HTTP_TIMEOUT_SECONDS (default 25). Raises on HTTP >= 400 or transport
    failure so the runtime can retry.
    """
    url = api.rstrip('/') + "/ingest"
    # Guard against a 0/negative misconfig, which would crash range() below.
    batch_size = max(1, int(os.environ.get("INGEST_BATCH_SIZE", "500")))
    timeout = float(os.environ.get("HTTP_TIMEOUT_SECONDS", "25"))

    sent = 0
    # Send in batches to avoid oversized payloads.
    for start in range(0, len(items), batch_size):
        batch = items[start:start + batch_size]
        try:
            r = requests.post(url, json=batch, timeout=timeout)
            logging.info("POST %s [%d items] -> %d", url, len(batch), r.status_code)
            if r.status_code >= 400:
                logging.error("Agent API error %d: %s", r.status_code, r.text[:500])
                # Raise to trigger retry/poison if desired.
                raise RuntimeError(f"Agent API returned {r.status_code}")
            sent += len(batch)
        except Exception:
            logging.exception("HTTP call to Agent API failed after sending %d/%d items.", sent, len(items))
            # Re-raise so the runtime can retry and dead-letter if needed.
            raise
    return sent