Lumi/plugins/lumi_ai_web_search/backend/search_tool.js
2026-06-14 05:01:13 +02:00

433 lines
16 KiB
JavaScript

const crypto = require("crypto");
const fs = require("fs");
const path = require("path");
const { ToolCache } = require("./cache");
const { LumiSearchBroker } = require("./lumi_search_broker");
const { PageFetcher } = require("./page_fetcher");
const { ExternalSearchProvider } = require("./provider_adapter");
const { ToolConcurrency, ToolRateLimits } = require("./rate_limits");
const {
REASONS,
formatFetchedPage,
formatResults,
normalizeOrigin
} = require("./result_formatter");
const {
capabilityEnabled,
readSettings,
readStatus,
writeStatus
} = require("./settings");
const { evaluateUrl } = require("./url_policy");
/**
 * Entry point for the web lookup capabilities ("search", "fetch_url" and
 * "summarize_url").
 *
 * Every request follows the same pipeline: build a request context from the
 * current settings, run the preflight gates (capability switch, origin
 * allow-list, rate limit, provider configuration), consult the cache, perform
 * the lookup, and finally pass the result through `finish()`, which appends an
 * audit record and refreshes the status document.
 */
class WebSearchTool {
  /**
   * @param {object} [options] Collaborator overrides; anything not supplied is
   *   constructed with defaults.
   * @param {string} [options.dataDir] Directory handed to the settings/status
   *   helpers; also holds `audit.jsonl` and the default cache directory.
   *   Required unless `options.cache` is supplied (it is joined into a path).
   * @param {() => number} [options.now] Clock used for timing; defaults to `Date.now`.
   * @param {object} [options.fetcher] Page fetcher (`fetchPage`, optional `resolveHost`).
   * @param {object} [options.broker] Default search provider.
   * @param {object} [options.externalProvider] Provider used when settings select `external_json`.
   * @param {object} [options.cache] Cache exposing `get`, `set` and `stats`.
   * @param {object} [options.rateLimits] Rate limiter exposing `consume`.
   * @param {object} [options.concurrency] Concurrency gate exposing `run`.
   */
  constructor(options = {}) {
    this.dataDir = options.dataDir;
    this.now = options.now || Date.now;
    this.fetcher = options.fetcher || new PageFetcher(options);
    this.broker = options.broker || new LumiSearchBroker({ fetcher: this.fetcher });
    this.externalProvider = options.externalProvider || new ExternalSearchProvider({ fetcher: this.fetcher });
    this.cache = options.cache || new ToolCache({
      directory: path.join(this.dataDir, "cache"),
      now: this.now
    });
    this.rateLimits = options.rateLimits || new ToolRateLimits({ now: this.now });
    // NOTE(review): the meaning of (3, 20) is defined by ToolConcurrency —
    // presumably parallel slots and queue length; confirm in rate_limits.js.
    this.concurrency = options.concurrency || new ToolConcurrency(3, 20);
  }

  /**
   * Runs a search through the concurrency gate.
   * @param {object} [input] Request (`query`, `reason`, `max_results`,
   *   `freshness`, `target_url`, `user`, `ctx`, `origin`).
   * @returns {Promise<object>} Result envelope (see `result()`).
   */
  async search(input = {}) {
    return this.runConcurrent(input, "search", () => this.performSearch(input));
  }

  /**
   * Executes a search: preflight, cache lookup, provider query, per-result
   * URL policy filtering, optional fetch of `input.target_url`, and result
   * assembly. Failures are converted into result envelopes, never thrown.
   * @param {object} [input] Same shape as for `search()`.
   * @returns {Promise<object>} Result envelope.
   */
  async performSearch(input = {}) {
    const context = this.context(input, "search");
    const blocked = this.preflight(context);
    if (blocked) return this.finish(blocked, context);

    const query = cleanQuery(input.query);
    if (!query) return this.finish(this.blocked(context, "query_required"), context);
    context.query = query;
    context.reason = normalizeReason(input.reason);

    // The caller may ask for fewer results than the configured maximum, never more.
    const maximum = Math.max(1, Math.min(
      context.settings.max_results,
      Number.parseInt(input.max_results, 10) || context.settings.max_results
    ));
    // Normalised once so the cache key and the provider see the same value.
    const freshness = String(input.freshness || "").slice(0, 40);
    // The target URL is part of the key: the cached envelope embeds that URL's
    // fetched page, so a request with a different (or no) target must not reuse it.
    const cacheKey = JSON.stringify([
      "search", query.toLowerCase(), context.reason, freshness,
      maximum, context.settings.provider, context.settings.policy_mode,
      context.settings.url_rules, context.origin,
      String(input.target_url || "")
    ]);
    const cached = this.cache.get(cacheKey, context.settings.cache_ttl_seconds);
    if (cached) {
      return this.finish({
        ...cached,
        cache_hit: true,
        timing_ms: this.now() - context.started
      }, context);
    }

    try {
      const provider = context.settings.provider === "external_json"
        ? this.externalProvider
        : this.broker;
      const discovered = await provider.search(query, {
        settings: context.settings,
        freshness
      });

      // Keep the resolver attached to its fetcher so it still works if it
      // relies on `this`; a missing resolver is passed through unchanged.
      const resolveHost = typeof this.fetcher.resolveHost === "function"
        ? this.fetcher.resolveHost.bind(this.fetcher)
        : this.fetcher.resolveHost;

      const allowed = [];
      const warnings = [...(discovered.warnings || [])];
      let blockedResults = 0;
      for (const row of discovered.results || []) {
        if (allowed.length >= maximum) break;
        const policy = await evaluateUrl(row.url, {
          mode: context.settings.policy_mode,
          rules: context.settings.url_rules,
          resolveHost
        });
        if (!policy.allowed) {
          blockedResults += 1;
          warnings.push(`Blocked result from ${safeDomain(row.url) || "unknown source"}: ${policy.reason}.`);
          continue;
        }
        // Use the URL returned by the policy check rather than the raw one.
        allowed.push({ ...row, url: policy.url });
      }

      const formatted = formatResults(allowed, {
        reason: context.reason,
        origin: context.origin,
        settings: context.settings,
        maxResults: maximum
      });

      // A failed target fetch only produces a warning; the search results stand.
      const fetchedPages = [];
      if (input.target_url) {
        try {
          const target = await this.fetcher.fetchPage(input.target_url, context.settings);
          fetchedPages.push(formatFetchedPage(target, {
            origin: context.origin,
            settings: context.settings
          }));
        } catch (error) {
          warnings.push(`Target URL fetch failed: ${safeFailure(error)}.`);
        }
      }

      const adapterErrors = discovered.adapter_errors || [];
      // Results win; otherwise explain the emptiness: policy, adapters, or nothing found.
      const status = formatted.results.length
        ? adapterErrors.length ? "partial" : "ok"
        : blockedResults > 0
          ? "blocked"
          : adapterErrors.length
            ? "unavailable"
            : "no_results";
      const result = this.result(context, {
        status,
        blocked_reason: status === "blocked" ? "all_results_blocked" : null,
        provider: discovered.provider || context.settings.provider,
        result_count: formatted.results.length,
        results: formatted.results,
        fetched_pages: fetchedPages,
        warnings,
        errors: adapterErrors,
        condensed_text: formatted.condensed_text,
        output_budget_chars: formatted.output_budget_chars,
        truncated: formatted.truncated
      });
      // Only non-empty result sets are cached.
      if (context.settings.cache_ttl_seconds > 0 && formatted.results.length) this.cache.set(cacheKey, result);
      return this.finish(result, context);
    } catch (error) {
      return this.finish(this.failure(context, error), context);
    }
  }

  /**
   * Fetches a single URL through the concurrency gate.
   * @param {object} [input] Request (`url` or `target_url`, `reason`, `user`, `ctx`, `origin`).
   * @returns {Promise<object>} Result envelope.
   */
  async fetchUrl(input = {}) {
    return this.runConcurrent(input, "fetch_url", () => this.fetchCapability(input, "fetch_url"));
  }

  /**
   * Fetches a single URL for summarisation through the concurrency gate.
   * @param {object} [input] Same shape as for `fetchUrl()`.
   * @returns {Promise<object>} Result envelope.
   */
  async summarizeUrl(input = {}) {
    return this.runConcurrent(input, "summarize_url", () => this.fetchCapability(input, "summarize_url"));
  }

  /**
   * Runs `callback` under the concurrency gate. An error whose `code` is
   * "concurrency_limited" becomes a blocked result with a 2 second retry
   * hint; any other error is rethrown.
   * @param {object} input Original request, used to build the audit context.
   * @param {string} capability Capability name for the audit context.
   * @param {() => Promise<object>} callback Work to run.
   * @returns {Promise<object>} Result envelope.
   */
  async runConcurrent(input, capability, callback) {
    try {
      return await this.concurrency.run(callback);
    } catch (error) {
      if (error?.code !== "concurrency_limited") throw error;
      const context = this.context(input, capability);
      context.query = cleanQuery(input.query || input.url || input.target_url);
      return this.finish(this.blocked(context, "concurrency_limited", {
        retry_after_seconds: 2
      }), context);
    }
  }

  /**
   * Shared implementation of "fetch_url" and "summarize_url": preflight,
   * cache lookup, page fetch and result assembly. Failures are converted into
   * result envelopes, never thrown.
   * @param {object} input Request (`url` or `target_url`, `reason`, ...).
   * @param {string} capability "fetch_url" or "summarize_url".
   * @returns {Promise<object>} Result envelope with the page in `fetched_pages`.
   */
  async fetchCapability(input, capability) {
    const context = this.context(input, capability);
    const blocked = this.preflight(context);
    if (blocked) return this.finish(blocked, context);

    const targetUrl = String(input.url || input.target_url || "").trim().slice(0, 2048);
    if (!targetUrl) return this.finish(this.blocked(context, "url_required"), context);
    context.query = targetUrl;
    context.reason = normalizeReason(input.reason || (
      capability === "summarize_url" ? "resource_lookup" : "general_lookup"
    ));

    const cacheKey = JSON.stringify([
      capability, targetUrl, context.settings.policy_mode,
      context.settings.url_rules, context.origin
    ]);
    const cached = this.cache.get(cacheKey, context.settings.cache_ttl_seconds);
    if (cached) {
      return this.finish({
        ...cached,
        cache_hit: true,
        timing_ms: this.now() - context.started
      }, context);
    }

    try {
      const page = await this.fetcher.fetchPage(targetUrl, context.settings);
      const formatted = formatFetchedPage(page, {
        origin: context.origin,
        settings: context.settings
      });
      const result = this.result(context, {
        status: formatted.extraction_status === "ok" ? "ok" : "no_results",
        provider: "direct_http",
        result_count: 0,
        results: [],
        fetched_pages: [formatted],
        warnings: formatted.truncated ? ["Extracted page text was truncated to the configured origin budget."] : [],
        errors: [],
        // Summaries carry only the extracted text; plain fetches also include title and description.
        condensed_text: capability === "summarize_url"
          ? formatted.extracted_text
          : [formatted.title, formatted.description, formatted.extracted_text].filter(Boolean).join("\n")
      });
      // Only successful extractions are cached.
      if (context.settings.cache_ttl_seconds > 0 && formatted.extraction_status === "ok") {
        this.cache.set(cacheKey, result);
      }
      return this.finish(result, context);
    } catch (error) {
      return this.finish(this.failure(context, error), context);
    }
  }

  /**
   * Builds the per-request context: start time, freshly read settings,
   * normalised origin and the caller identity fields used for auditing.
   * @param {object} input Request.
   * @param {string} capability Capability name.
   * @returns {object} Context consumed by the other methods.
   */
  context(input, capability) {
    const settings = readSettings(this.dataDir);
    const origin = normalizeOrigin(input.ctx?.origin || input.ctx?.platform || input.origin || "other");
    return {
      started: this.now(),
      capability,
      settings,
      origin,
      actor: String(input.user?.id || input.user?.username || "unknown").slice(0, 120),
      username: String(input.user?.username || "").slice(0, 120),
      role: input.user?.isAdmin ? "admin" : input.user?.isMod ? "mod" : "user",
      server: String(input.ctx?.server_id || input.ctx?.channel_id || "direct").slice(0, 120),
      query: "",
      reason: normalizeReason(input.reason)
    };
  }

  /**
   * Applies the gates that must pass before any lookup work is done. The rate
   * limit is consumed here, i.e. before the cache is consulted.
   * @param {object} context Request context.
   * @returns {object|null} A blocked result envelope, or `null` when the request may proceed.
   */
  preflight(context) {
    if (!capabilityEnabled(context.settings, context.capability)) {
      return this.blocked(context, "capability_disabled");
    }
    if (!context.settings.allowed_origins.includes(context.origin)) {
      return this.blocked(context, "origin_not_allowed");
    }
    const rate = this.rateLimits.consume(context, context.settings);
    if (!rate.allowed) {
      return this.blocked(context, "rate_limited", {
        retry_after_seconds: rate.retry_after_seconds
      });
    }
    if (context.settings.provider === "external_json" &&
      context.capability === "search" &&
      !context.settings.external_provider_endpoint) {
      return this.blocked(context, "external_provider_not_configured");
    }
    return null;
  }

  /**
   * Creates a result envelope with default fields; keys in `patch` override them.
   * @param {object} context Request context.
   * @param {object} [patch] Fields to override or add.
   * @returns {object} Result envelope.
   */
  result(context, patch = {}) {
    return {
      status: "ok",
      query: context.query,
      reason: context.reason,
      provider: context.settings.provider,
      policy_mode: context.settings.policy_mode,
      cache_hit: false,
      timing_ms: Math.max(0, this.now() - context.started),
      result_count: 0,
      blocked_reason: null,
      results: [],
      fetched_pages: [],
      warnings: [],
      errors: [],
      ...patch
    };
  }

  /**
   * Creates a "blocked" result envelope.
   * @param {object} context Request context.
   * @param {string} reason Value stored in `blocked_reason`.
   * @param {object} [patch] Extra fields (e.g. `retry_after_seconds`).
   * @returns {object} Result envelope.
   */
  blocked(context, reason, patch = {}) {
    return this.result(context, {
      status: "blocked",
      blocked_reason: reason,
      ...patch
    });
  }

  /**
   * Converts a thrown error into a result envelope: "blocked" when the error
   * code is "URL_BLOCKED", otherwise "unavailable". The error text is sanitised.
   * @param {object} context Request context.
   * @param {*} error Caught error.
   * @returns {object} Result envelope.
   */
  failure(context, error) {
    return this.result(context, {
      status: error?.code === "URL_BLOCKED" ? "blocked" : "unavailable",
      blocked_reason: error?.blockedReason || null,
      errors: [safeFailure(error)]
    });
  }

  /**
   * Finalises a result: stamps `timing_ms` and `user_message` onto it (the
   * object is mutated), writes the audit record and updates the status document.
   * @param {object} result Result envelope.
   * @param {object} context Request context.
   * @returns {object} The same `result` object.
   */
  finish(result, context) {
    result.timing_ms = Math.max(0, this.now() - context.started);
    result.user_message = userMessage(result);
    const audit = {
      actor: context.actor,
      username: context.username || null,
      role: context.role,
      origin: context.origin,
      server: context.server,
      capability: context.capability,
      // The raw query is never logged: only a short hash and a redacted summary.
      query_hash: queryHash(context.query),
      query_summary: safeQuerySummary(context.query),
      reason: context.reason,
      provider: result.provider,
      policy_mode: result.policy_mode,
      policy_decision: ["ok", "partial", "no_results"].includes(result.status) ? "allowed" : "blocked",
      result_count: result.result_count,
      cache_hit: result.cache_hit,
      timing_ms: result.timing_ms,
      blocked_reason: result.blocked_reason,
      status: result.status
    };
    this.audit(audit);
    this.updateStatus(result, audit);
    return result;
  }

  /**
   * Appends one timestamped JSON line to `audit.jsonl` in the data directory,
   * creating the directory if needed. Uses synchronous file I/O.
   * @param {object} entry Audit fields.
   */
  audit(entry) {
    fs.mkdirSync(this.dataDir, { recursive: true });
    fs.appendFileSync(path.join(this.dataDir, "audit.jsonl"), `${JSON.stringify({
      timestamp: new Date().toISOString(),
      ...entry
    })}\n`);
  }

  /**
   * Folds one finished request into the persisted status document: running
   * counters, per-reason failure counts, the 20 most recent requests, and the
   * provider health derived from this request's status.
   * @param {object} result Finished result envelope.
   * @param {object} audit Audit record built by `finish()`.
   */
  updateStatus(result, audit) {
    const current = readStatus(this.dataDir);
    const metrics = {
      searches: Number(current.metrics?.searches || 0) + (audit.capability === "search" ? 1 : 0),
      fetches: Number(current.metrics?.fetches || 0) + (audit.capability !== "search" ? 1 : 0),
      successes: Number(current.metrics?.successes || 0) + (result.status === "ok" ? 1 : 0),
      blocked: Number(current.metrics?.blocked || 0) + (result.status === "blocked" ? 1 : 0),
      cache_hits: Number(current.metrics?.cache_hits || 0) + (result.cache_hit ? 1 : 0),
      failures: Number(current.metrics?.failures || 0) + (result.status === "unavailable" ? 1 : 0),
      total_search_ms: Number(current.metrics?.total_search_ms || 0) +
        (audit.capability === "search" ? result.timing_ms : 0),
      total_fetch_ms: Number(current.metrics?.total_fetch_ms || 0) +
        (audit.capability !== "search" ? result.timing_ms : 0),
      failures_by_reason: {
        ...(current.metrics?.failures_by_reason || {})
      }
    };
    // A blocked reason takes precedence; otherwise use the first error of an unavailable result.
    const failureReason = result.blocked_reason || (
      result.status === "unavailable" ? result.errors?.[0]?.reason || result.errors?.[0] || "unavailable" : null
    );
    if (failureReason) {
      const key = String(failureReason).slice(0, 120);
      metrics.failures_by_reason[key] = Number(metrics.failures_by_reason[key] || 0) + 1;
    }
    // Newest first, capped at 20 entries.
    const recent = [
      {
        timestamp: new Date().toISOString(),
        capability: audit.capability,
        origin: audit.origin,
        query_summary: audit.query_summary,
        provider: audit.provider,
        status: audit.status,
        blocked_reason: audit.blocked_reason,
        timing_ms: audit.timing_ms
      },
      ...(Array.isArray(current.recent) ? current.recent : [])
    ].slice(0, 20);
    writeStatus(this.dataDir, {
      provider: result.provider,
      provider_health: ["ok", "partial", "no_results"].includes(result.status) ? "available" : "degraded",
      last_success_at: result.status === "ok" ? new Date().toISOString() : current.last_success_at || null,
      last_error: ["blocked", "unavailable"].includes(result.status)
        ? result.blocked_reason || result.errors?.[0] || result.status
        : null,
      cache: this.cache.stats(),
      metrics,
      recent
    });
  }
}
/**
 * Normalises free-form query text: control characters become spaces, runs of
 * whitespace collapse to a single space, the ends are trimmed and the result
 * is capped at 500 characters. Falsy input yields an empty string.
 * @param {*} value Raw query.
 * @returns {string} Cleaned query.
 */
function cleanQuery(value) {
  const raw = String(value || "");
  const withoutControls = raw.replace(/[\u0000-\u001f\u007f]/g, " ");
  const collapsed = withoutControls.replace(/\s+/g, " ").trim();
  return collapsed.slice(0, 500);
}
/**
 * Returns the reason as a string when it is a member of `REASONS`;
 * anything else falls back to "general_lookup".
 * @param {*} value Requested reason.
 * @returns {string} A recognised reason.
 */
function normalizeReason(value) {
  const candidate = String(value || "");
  if (!REASONS.has(candidate)) {
    return "general_lookup";
  }
  return String(value);
}
/**
 * Produces a short fingerprint of a query: the first 16 hex characters of its
 * SHA-256 digest. Falsy input yields `null`.
 * @param {*} value Query text.
 * @returns {string|null} 16-character hex prefix, or `null`.
 */
function queryHash(value) {
  if (!value) {
    return null;
  }
  const hasher = crypto.createHash("sha256");
  hasher.update(String(value));
  return hasher.digest("hex").slice(0, 16);
}
/**
 * Builds the short, log-safe form of a query or URL.
 *
 * The text is cleaned with `cleanQuery`; if it parses as an absolute URL with
 * embedded credentials, those are removed; values of parameters whose name
 * ends in key/token/secret/password (e.g. `key`, `api_key`, `access_token`,
 * `client_secret`) are replaced with `[REDACTED]`; the result is capped at
 * 120 characters.
 * @param {*} value Raw query or URL.
 * @returns {string} Redacted summary.
 */
function safeQuerySummary(value) {
  let text = cleanQuery(value);
  try {
    const url = new URL(text);
    if (url.username || url.password) {
      url.username = "";
      url.password = "";
      text = url.href;
    }
  } catch {
    // Not an absolute URL (e.g. a plain search phrase): no credentials to strip.
  }
  // Same name-suffix matching as safeFailure, so prefixed parameter names
  // and fragment tokens are redacted too, not only a bare `?key=`.
  text = text.replace(/((?:key|token|secret|password)=)[^&\s]+/gi, "$1[REDACTED]");
  return text.slice(0, 120);
}
/**
 * Extracts the hostname from a URL string.
 * @param {*} value Candidate URL.
 * @returns {string} Hostname, or an empty string when the value does not parse as a URL.
 */
function safeDomain(value) {
  let hostname = "";
  try {
    const parsed = new URL(value);
    hostname = parsed.hostname;
  } catch {
    hostname = "";
  }
  return hostname;
}
/**
 * Turns an error into a short message that is safe to show and log.
 * Known error codes map to fixed phrases; any other message has URLs replaced
 * with "[url]", key/token/secret/password values redacted, and is capped at
 * 240 characters.
 * @param {*} error Caught error (may be null/undefined).
 * @returns {string} Sanitised failure text.
 */
function safeFailure(error) {
  const code = error?.code;
  if (error?.name === "AbortError" || code === "timeout") {
    return "request timed out";
  }
  switch (code) {
    case "response_too_large":
      return "response exceeded the configured size limit";
    case "unsupported_content_type":
      return "content type is not supported";
    case "URL_BLOCKED":
      return `URL blocked: ${error.blockedReason || "safety policy"}`;
    default:
      break;
  }
  const message = String(error?.message || "provider unavailable");
  const withoutUrls = message.replace(/https?:\/\/\S+/g, "[url]");
  const withoutSecrets = withoutUrls.replace(/(key|token|secret|password)=[^&\s]+/gi, "$1=[REDACTED]");
  return withoutSecrets.slice(0, 240);
}
/**
 * Chooses the user-facing message for a result envelope. Successful statuses
 * are handled first, then specific blocked reasons, then the generic blocked
 * and unavailable messages.
 * @param {object} result Result envelope (`status`, `blocked_reason`,
 *   `condensed_text`, `retry_after_seconds`).
 * @returns {string} Message to show to the user.
 */
function userMessage(result) {
  switch (result.status) {
    case "ok":
      return result.condensed_text || "The public lookup completed.";
    case "partial":
      return "The lookup returned partial results because one or more public sources were unavailable.";
    case "no_results":
      return "No permitted public results were found.";
    default:
      break;
  }

  switch (result.blocked_reason) {
    case "rate_limited":
      return `Web lookup is rate-limited. Retry in ${result.retry_after_seconds || 60} seconds.`;
    case "concurrency_limited":
      return "Web lookup is busy. Retry in a few seconds.";
    case "origin_not_allowed":
      return "Web lookup is not enabled for this platform.";
    case "capability_disabled":
      return "This web lookup capability is disabled.";
    case "external_provider_not_configured":
      return "The optional external provider is selected but not configured. Select Lumi search broker or configure the endpoint.";
    default:
      break;
  }

  return result.status === "blocked"
    ? "The URL or lookup was blocked by the configured safety policy."
    : "Live public verification is currently unavailable.";
}
// Public surface of this module: the tool class plus the helper functions
// defined above. safeDomain is not part of this list and stays module-private.
module.exports = {
WebSearchTool,
cleanQuery,
normalizeReason,
queryHash,
safeFailure,
safeQuerySummary,
userMessage
};