const crypto = require("crypto"); const fs = require("fs"); const path = require("path"); const { ToolCache } = require("./cache"); const { LumiSearchBroker } = require("./lumi_search_broker"); const { PageFetcher } = require("./page_fetcher"); const { ExternalSearchProvider } = require("./provider_adapter"); const { ToolConcurrency, ToolRateLimits } = require("./rate_limits"); const { REASONS, formatFetchedPage, formatResults, normalizeOrigin } = require("./result_formatter"); const { capabilityEnabled, readSettings, readStatus, writeStatus } = require("./settings"); const { evaluateUrl } = require("./url_policy"); class WebSearchTool { constructor(options = {}) { this.dataDir = options.dataDir; this.now = options.now || Date.now; this.fetcher = options.fetcher || new PageFetcher(options); this.broker = options.broker || new LumiSearchBroker({ fetcher: this.fetcher }); this.externalProvider = options.externalProvider || new ExternalSearchProvider({ fetcher: this.fetcher }); this.cache = options.cache || new ToolCache({ directory: path.join(this.dataDir, "cache"), now: this.now }); this.rateLimits = options.rateLimits || new ToolRateLimits({ now: this.now }); this.concurrency = options.concurrency || new ToolConcurrency(3, 20); } async search(input = {}) { return this.runConcurrent(input, "search", () => this.performSearch(input)); } async performSearch(input = {}) { const context = this.context(input, "search"); const blocked = this.preflight(context); if (blocked) return this.finish(blocked, context); const query = cleanQuery(input.query); if (!query) return this.finish(this.blocked(context, "query_required"), context); context.query = query; context.reason = normalizeReason(input.reason); const maximum = Math.max(1, Math.min( context.settings.max_results, Number.parseInt(input.max_results, 10) || context.settings.max_results )); const cacheKey = JSON.stringify([ "search", query.toLowerCase(), context.reason, String(input.freshness || ""), maximum, context.settings.provider, context.settings.policy_mode, context.settings.url_rules, context.origin ]); const cached = this.cache.get(cacheKey, context.settings.cache_ttl_seconds); if (cached) return this.finish({ ...cached, cache_hit: true, timing_ms: this.now() - context.started }, context); try { const provider = context.settings.provider === "external_json" ? this.externalProvider : this.broker; const discovered = await provider.search(query, { settings: context.settings, freshness: String(input.freshness || "").slice(0, 40) }); const allowed = []; const warnings = [...(discovered.warnings || [])]; let blockedResults = 0; for (const row of discovered.results || []) { if (allowed.length >= maximum) break; const policy = await evaluateUrl(row.url, { mode: context.settings.policy_mode, rules: context.settings.url_rules, resolveHost: this.fetcher.resolveHost }); if (!policy.allowed) { blockedResults += 1; warnings.push(`Blocked result from ${safeDomain(row.url) || "unknown source"}: ${policy.reason}.`); continue; } allowed.push({ ...row, url: policy.url }); } const formatted = formatResults(allowed, { reason: context.reason, origin: context.origin, settings: context.settings, maxResults: maximum }); const fetchedPages = []; if (input.target_url) { try { const target = await this.fetcher.fetchPage(input.target_url, context.settings); fetchedPages.push(formatFetchedPage(target, { origin: context.origin, settings: context.settings })); } catch (error) { warnings.push(`Target URL fetch failed: ${safeFailure(error)}.`); } } const adapterErrors = discovered.adapter_errors || []; const status = formatted.results.length ? adapterErrors.length ? "partial" : "ok" : blockedResults > 0 ? "blocked" : adapterErrors.length ? "unavailable" : "no_results"; const result = this.result(context, { status, blocked_reason: status === "blocked" ? "all_results_blocked" : null, provider: discovered.provider || context.settings.provider, result_count: formatted.results.length, results: formatted.results, fetched_pages: fetchedPages, warnings, errors: adapterErrors, condensed_text: formatted.condensed_text, output_budget_chars: formatted.output_budget_chars, truncated: formatted.truncated }); if (context.settings.cache_ttl_seconds > 0 && formatted.results.length) this.cache.set(cacheKey, result); return this.finish(result, context); } catch (error) { return this.finish(this.failure(context, error), context); } } async fetchUrl(input = {}) { return this.runConcurrent(input, "fetch_url", () => this.fetchCapability(input, "fetch_url")); } async summarizeUrl(input = {}) { return this.runConcurrent(input, "summarize_url", () => this.fetchCapability(input, "summarize_url")); } async runConcurrent(input, capability, callback) { try { return await this.concurrency.run(callback); } catch (error) { if (error?.code !== "concurrency_limited") throw error; const context = this.context(input, capability); context.query = cleanQuery(input.query || input.url || input.target_url); return this.finish(this.blocked(context, "concurrency_limited", { retry_after_seconds: 2 }), context); } } async fetchCapability(input, capability) { const context = this.context(input, capability); const blocked = this.preflight(context); if (blocked) return this.finish(blocked, context); const targetUrl = String(input.url || input.target_url || "").trim().slice(0, 2048); if (!targetUrl) return this.finish(this.blocked(context, "url_required"), context); context.query = targetUrl; context.reason = normalizeReason(input.reason || ( capability === "summarize_url" ? "resource_lookup" : "general_lookup" )); const cacheKey = JSON.stringify([ capability, targetUrl, context.settings.policy_mode, context.settings.url_rules, context.origin ]); const cached = this.cache.get(cacheKey, context.settings.cache_ttl_seconds); if (cached) return this.finish({ ...cached, cache_hit: true, timing_ms: this.now() - context.started }, context); try { const page = await this.fetcher.fetchPage(targetUrl, context.settings); const formatted = formatFetchedPage(page, { origin: context.origin, settings: context.settings }); const result = this.result(context, { status: formatted.extraction_status === "ok" ? "ok" : "no_results", provider: "direct_http", result_count: 0, results: [], fetched_pages: [formatted], warnings: formatted.truncated ? ["Extracted page text was truncated to the configured origin budget."] : [], errors: [], condensed_text: capability === "summarize_url" ? formatted.extracted_text : [formatted.title, formatted.description, formatted.extracted_text].filter(Boolean).join("\n") }); if (context.settings.cache_ttl_seconds > 0 && formatted.extraction_status === "ok") { this.cache.set(cacheKey, result); } return this.finish(result, context); } catch (error) { return this.finish(this.failure(context, error), context); } } context(input, capability) { const settings = readSettings(this.dataDir); const origin = normalizeOrigin(input.ctx?.origin || input.ctx?.platform || input.origin || "other"); return { started: this.now(), capability, settings, origin, actor: String(input.user?.id || input.user?.username || "unknown").slice(0, 120), username: String(input.user?.username || "").slice(0, 120), role: input.user?.isAdmin ? "admin" : input.user?.isMod ? "mod" : "user", server: String(input.ctx?.server_id || input.ctx?.channel_id || "direct").slice(0, 120), query: "", reason: normalizeReason(input.reason) }; } preflight(context) { if (!capabilityEnabled(context.settings, context.capability)) { return this.blocked(context, "capability_disabled"); } if (!context.settings.allowed_origins.includes(context.origin)) { return this.blocked(context, "origin_not_allowed"); } const rate = this.rateLimits.consume(context, context.settings); if (!rate.allowed) { return this.blocked(context, "rate_limited", { retry_after_seconds: rate.retry_after_seconds }); } if (context.settings.provider === "external_json" && context.capability === "search" && !context.settings.external_provider_endpoint) { return this.blocked(context, "external_provider_not_configured"); } return null; } result(context, patch = {}) { return { status: "ok", query: context.query, reason: context.reason, provider: context.settings.provider, policy_mode: context.settings.policy_mode, cache_hit: false, timing_ms: Math.max(0, this.now() - context.started), result_count: 0, blocked_reason: null, results: [], fetched_pages: [], warnings: [], errors: [], ...patch }; } blocked(context, reason, patch = {}) { return this.result(context, { status: "blocked", blocked_reason: reason, ...patch }); } failure(context, error) { return this.result(context, { status: error?.code === "URL_BLOCKED" ? "blocked" : "unavailable", blocked_reason: error?.blockedReason || null, errors: [safeFailure(error)] }); } finish(result, context) { result.timing_ms = Math.max(0, this.now() - context.started); result.user_message = userMessage(result); const audit = { actor: context.actor, username: context.username || null, role: context.role, origin: context.origin, server: context.server, capability: context.capability, query_hash: queryHash(context.query), query_summary: safeQuerySummary(context.query), reason: context.reason, provider: result.provider, policy_mode: result.policy_mode, policy_decision: ["ok", "partial", "no_results"].includes(result.status) ? "allowed" : "blocked", result_count: result.result_count, cache_hit: result.cache_hit, timing_ms: result.timing_ms, blocked_reason: result.blocked_reason, status: result.status }; this.audit(audit); this.updateStatus(result, audit); return result; } audit(entry) { fs.mkdirSync(this.dataDir, { recursive: true }); fs.appendFileSync(path.join(this.dataDir, "audit.jsonl"), `${JSON.stringify({ timestamp: new Date().toISOString(), ...entry })}\n`); } updateStatus(result, audit) { const current = readStatus(this.dataDir); const metrics = { searches: Number(current.metrics?.searches || 0) + (audit.capability === "search" ? 1 : 0), fetches: Number(current.metrics?.fetches || 0) + (audit.capability !== "search" ? 1 : 0), successes: Number(current.metrics?.successes || 0) + (result.status === "ok" ? 1 : 0), blocked: Number(current.metrics?.blocked || 0) + (result.status === "blocked" ? 1 : 0), cache_hits: Number(current.metrics?.cache_hits || 0) + (result.cache_hit ? 1 : 0), failures: Number(current.metrics?.failures || 0) + (result.status === "unavailable" ? 1 : 0), total_search_ms: Number(current.metrics?.total_search_ms || 0) + (audit.capability === "search" ? result.timing_ms : 0), total_fetch_ms: Number(current.metrics?.total_fetch_ms || 0) + (audit.capability !== "search" ? result.timing_ms : 0), failures_by_reason: { ...(current.metrics?.failures_by_reason || {}) } }; const failureReason = result.blocked_reason || ( result.status === "unavailable" ? result.errors?.[0]?.reason || result.errors?.[0] || "unavailable" : null ); if (failureReason) { const key = String(failureReason).slice(0, 120); metrics.failures_by_reason[key] = Number(metrics.failures_by_reason[key] || 0) + 1; } const recent = [ { timestamp: new Date().toISOString(), capability: audit.capability, origin: audit.origin, query_summary: audit.query_summary, provider: audit.provider, status: audit.status, blocked_reason: audit.blocked_reason, timing_ms: audit.timing_ms }, ...(Array.isArray(current.recent) ? current.recent : []) ].slice(0, 20); writeStatus(this.dataDir, { provider: result.provider, provider_health: ["ok", "partial", "no_results"].includes(result.status) ? "available" : "degraded", last_success_at: result.status === "ok" ? new Date().toISOString() : current.last_success_at || null, last_error: ["blocked", "unavailable"].includes(result.status) ? result.blocked_reason || result.errors?.[0] || result.status : null, cache: this.cache.stats(), metrics, recent }); } } function cleanQuery(value) { return String(value || "").replace(/[\u0000-\u001f\u007f]/g, " ").replace(/\s+/g, " ").trim().slice(0, 500); } function normalizeReason(value) { return REASONS.has(String(value || "")) ? String(value) : "general_lookup"; } function queryHash(value) { return value ? crypto.createHash("sha256").update(String(value)).digest("hex").slice(0, 16) : null; } function safeQuerySummary(value) { let text = cleanQuery(value); try { const url = new URL(text); if (url.username || url.password) { url.username = ""; url.password = ""; text = url.href; } } catch {} text = text.replace(/([?&](?:key|token|secret|password)=)[^&\s]+/gi, "$1[REDACTED]"); return text.slice(0, 120); } function safeDomain(value) { try { return new URL(value).hostname; } catch { return ""; } } function safeFailure(error) { if (error?.name === "AbortError" || error?.code === "timeout") return "request timed out"; if (error?.code === "response_too_large") return "response exceeded the configured size limit"; if (error?.code === "unsupported_content_type") return "content type is not supported"; if (error?.code === "URL_BLOCKED") return `URL blocked: ${error.blockedReason || "safety policy"}`; return String(error?.message || "provider unavailable") .replace(/https?:\/\/\S+/g, "[url]") .replace(/(key|token|secret|password)=[^&\s]+/gi, "$1=[REDACTED]") .slice(0, 240); } function userMessage(result) { if (result.status === "ok") return result.condensed_text || "The public lookup completed."; if (result.status === "partial") return "The lookup returned partial results because one or more public sources were unavailable."; if (result.status === "no_results") return "No permitted public results were found."; if (result.blocked_reason === "rate_limited") { return `Web lookup is rate-limited. Retry in ${result.retry_after_seconds || 60} seconds.`; } if (result.blocked_reason === "concurrency_limited") { return "Web lookup is busy. Retry in a few seconds."; } if (result.blocked_reason === "origin_not_allowed") return "Web lookup is not enabled for this platform."; if (result.blocked_reason === "capability_disabled") return "This web lookup capability is disabled."; if (result.blocked_reason === "external_provider_not_configured") { return "The optional external provider is selected but not configured. Select Lumi search broker or configure the endpoint."; } if (result.status === "blocked") return "The URL or lookup was blocked by the configured safety policy."; return "Live public verification is currently unavailable."; } module.exports = { WebSearchTool, cleanQuery, normalizeReason, queryHash, safeFailure, safeQuerySummary, userMessage };