mirror of
https://gitea.toothfairyai.com/ToothFairyAI/tf_code.git
synced 2026-03-30 05:43:55 +00:00
182 lines
4.5 KiB
TypeScript
182 lines
4.5 KiB
TypeScript
import { z } from "zod"
|
|
import { Tool } from "./tool"
|
|
import TurndownService from "turndown"
|
|
import DESCRIPTION from "./webfetch.txt"
|
|
|
|
const MAX_RESPONSE_SIZE = 5 * 1024 * 1024 // 5MB
|
|
const DEFAULT_TIMEOUT = 30 * 1000 // 30 seconds
|
|
const MAX_TIMEOUT = 120 * 1000 // 2 minutes
|
|
|
|
export const WebFetchTool = Tool.define({
|
|
id: "webfetch",
|
|
description: DESCRIPTION,
|
|
parameters: z.object({
|
|
url: z.string().describe("The URL to fetch content from"),
|
|
format: z
|
|
.enum(["text", "markdown", "html"])
|
|
.describe(
|
|
"The format to return the content in (text, markdown, or html)",
|
|
),
|
|
timeout: z
|
|
.number()
|
|
.min(0)
|
|
.max(MAX_TIMEOUT / 1000)
|
|
.describe("Optional timeout in seconds (max 120)")
|
|
.optional(),
|
|
}),
|
|
async execute(params, ctx) {
|
|
// Validate URL
|
|
if (
|
|
!params.url.startsWith("http://") &&
|
|
!params.url.startsWith("https://")
|
|
) {
|
|
throw new Error("URL must start with http:// or https://")
|
|
}
|
|
|
|
const timeout = Math.min(
|
|
(params.timeout ?? DEFAULT_TIMEOUT / 1000) * 1000,
|
|
MAX_TIMEOUT,
|
|
)
|
|
|
|
const controller = new AbortController()
|
|
const timeoutId = setTimeout(() => controller.abort(), timeout)
|
|
|
|
const response = await fetch(params.url, {
|
|
signal: AbortSignal.any([controller.signal, ctx.abort]),
|
|
headers: {
|
|
"User-Agent":
|
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
|
Accept:
|
|
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
|
|
"Accept-Language": "en-US,en;q=0.9",
|
|
},
|
|
})
|
|
|
|
clearTimeout(timeoutId)
|
|
|
|
if (!response.ok) {
|
|
throw new Error(`Request failed with status code: ${response.status}`)
|
|
}
|
|
|
|
// Check content length
|
|
const contentLength = response.headers.get("content-length")
|
|
if (contentLength && parseInt(contentLength) > MAX_RESPONSE_SIZE) {
|
|
throw new Error("Response too large (exceeds 5MB limit)")
|
|
}
|
|
|
|
const arrayBuffer = await response.arrayBuffer()
|
|
if (arrayBuffer.byteLength > MAX_RESPONSE_SIZE) {
|
|
throw new Error("Response too large (exceeds 5MB limit)")
|
|
}
|
|
|
|
const content = new TextDecoder().decode(arrayBuffer)
|
|
const contentType = response.headers.get("content-type") || ""
|
|
|
|
const title = `${params.url} (${contentType})`
|
|
switch (params.format) {
|
|
case "text":
|
|
if (contentType.includes("text/html")) {
|
|
const text = await extractTextFromHTML(content)
|
|
return {
|
|
output: text,
|
|
metadata: {
|
|
title,
|
|
},
|
|
}
|
|
}
|
|
return {
|
|
output: content,
|
|
metadata: {
|
|
title,
|
|
},
|
|
}
|
|
|
|
case "markdown":
|
|
if (contentType.includes("text/html")) {
|
|
const markdown = convertHTMLToMarkdown(content)
|
|
return {
|
|
output: markdown,
|
|
metadata: {
|
|
title,
|
|
},
|
|
}
|
|
}
|
|
return {
|
|
output: "```\n" + content + "\n```",
|
|
metadata: {
|
|
title,
|
|
},
|
|
}
|
|
|
|
case "html":
|
|
return {
|
|
output: content,
|
|
metadata: {
|
|
title,
|
|
},
|
|
}
|
|
|
|
default:
|
|
return {
|
|
output: content,
|
|
metadata: {
|
|
title,
|
|
},
|
|
}
|
|
}
|
|
},
|
|
})
|
|
|
|
async function extractTextFromHTML(html: string) {
|
|
let text = ""
|
|
let skipContent = false
|
|
|
|
const rewriter = new HTMLRewriter()
|
|
.on("script, style, noscript, iframe, object, embed", {
|
|
element() {
|
|
skipContent = true
|
|
},
|
|
text() {
|
|
// Skip text content inside these elements
|
|
},
|
|
})
|
|
.on("*", {
|
|
element(element) {
|
|
// Reset skip flag when entering other elements
|
|
if (
|
|
![
|
|
"script",
|
|
"style",
|
|
"noscript",
|
|
"iframe",
|
|
"object",
|
|
"embed",
|
|
].includes(element.tagName)
|
|
) {
|
|
skipContent = false
|
|
}
|
|
},
|
|
text(input) {
|
|
if (!skipContent) {
|
|
text += input.text
|
|
}
|
|
},
|
|
})
|
|
.transform(new Response(html))
|
|
|
|
await rewriter.text()
|
|
return text.trim()
|
|
}
|
|
|
|
function convertHTMLToMarkdown(html: string): string {
|
|
const turndownService = new TurndownService({
|
|
headingStyle: "atx",
|
|
hr: "---",
|
|
bulletListMarker: "-",
|
|
codeBlockStyle: "fenced",
|
|
emDelimiter: "*",
|
|
})
|
|
turndownService.remove(["script", "style", "meta", "link"])
|
|
return turndownService.turndown(html)
|
|
}
|