// Source listing: JavaScript, 433 lines, 16 KiB.
const crypto = require("crypto");
|
|
const fs = require("fs");
|
|
const path = require("path");
|
|
const { ToolCache } = require("./cache");
|
|
const { LumiSearchBroker } = require("./lumi_search_broker");
|
|
const { PageFetcher } = require("./page_fetcher");
|
|
const { ExternalSearchProvider } = require("./provider_adapter");
|
|
const { ToolConcurrency, ToolRateLimits } = require("./rate_limits");
|
|
const {
|
|
REASONS,
|
|
formatFetchedPage,
|
|
formatResults,
|
|
normalizeOrigin
|
|
} = require("./result_formatter");
|
|
const {
|
|
capabilityEnabled,
|
|
readSettings,
|
|
readStatus,
|
|
writeStatus
|
|
} = require("./settings");
|
|
const { evaluateUrl } = require("./url_policy");
|
|
|
|
/**
 * Web lookup tool exposing three capabilities: `search`, `fetch_url` and
 * `summarize_url`. Each call is gated by settings (capability, origin, rate
 * limit), optionally served from cache, and always finished through
 * `finish()`, which appends an audit record and updates the status file.
 */
class WebSearchTool {
  /**
   * @param {object} [options]
   * @param {string} [options.dataDir] - Directory handed to the settings and
   *   status helpers; also holds `audit.jsonl` and the default `cache` folder.
   *   Required unless `options.cache` is supplied (it is passed to `path.join`).
   * @param {() => number} [options.now=Date.now] - Clock used for timings.
   * @param {object} [options.fetcher] - Page fetcher; defaults to `new PageFetcher(options)`.
   * @param {object} [options.broker] - Default search provider.
   * @param {object} [options.externalProvider] - Provider used when settings select `external_json`.
   * @param {object} [options.cache] - Cache exposing `get`, `set` and `stats`.
   * @param {object} [options.rateLimits] - Rate limiter exposing `consume`.
   * @param {object} [options.concurrency] - Concurrency gate exposing `run`.
   */
  constructor(options = {}) {
    this.dataDir = options.dataDir;
    this.now = options.now || Date.now;
    this.fetcher = options.fetcher || new PageFetcher(options);
    this.broker = options.broker || new LumiSearchBroker({ fetcher: this.fetcher });
    this.externalProvider = options.externalProvider || new ExternalSearchProvider({ fetcher: this.fetcher });
    this.cache = options.cache || new ToolCache({
      directory: path.join(this.dataDir, "cache"),
      now: this.now
    });
    this.rateLimits = options.rateLimits || new ToolRateLimits({ now: this.now });
    // NOTE(review): the meaning of (3, 20) is defined in ./rate_limits —
    // presumably max parallel runs and queue size; confirm there.
    this.concurrency = options.concurrency || new ToolConcurrency(3, 20);
  }

  /**
   * Runs a search under the concurrency gate.
   * @param {object} [input] - See `performSearch`.
   * @returns {Promise<object>} Result object produced by `finish()`.
   */
  async search(input = {}) {
    return this.runConcurrent(input, "search", () => this.performSearch(input));
  }

  /**
   * Executes one search: preflight, cache lookup, provider query, per-result
   * URL policy check, optional fetch of `input.target_url`, then caching.
   *
   * @param {object} [input]
   * @param {string} [input.query] - Search text; required after cleaning.
   * @param {string} [input.reason] - Lookup reason, normalized via `normalizeReason`.
   * @param {number|string} [input.max_results] - Upper bound, clamped to settings.
   * @param {string} [input.freshness] - Passed to the provider (first 40 chars).
   * @param {string} [input.target_url] - Extra page to fetch alongside the results.
   * @returns {Promise<object>} Finished result; provider errors become a
   *   `failure()` result instead of a rejection.
   */
  async performSearch(input = {}) {
    const context = this.context(input, "search");
    const blocked = this.preflight(context);
    if (blocked) return this.finish(blocked, context);
    const query = cleanQuery(input.query);
    if (!query) return this.finish(this.blocked(context, "query_required"), context);
    context.query = query;
    context.reason = normalizeReason(input.reason);
    const maximum = Math.max(1, Math.min(
      context.settings.max_results,
      Number.parseInt(input.max_results, 10) || context.settings.max_results
    ));
    // Use the same truncated freshness for the cache key and the provider so
    // that two inputs the provider sees identically also share a cache entry.
    const freshness = String(input.freshness || "").slice(0, 40);
    // Normalized the same way fetchCapability() normalizes its URL.
    const targetUrl = String(input.target_url || "").trim().slice(0, 2048);
    // targetUrl is part of the key because the cached result embeds the page
    // fetched for it; without it a different target_url would be served the
    // previously fetched page.
    const cacheKey = JSON.stringify([
      "search", query.toLowerCase(), context.reason, freshness,
      maximum, context.settings.provider, context.settings.policy_mode,
      context.settings.url_rules, context.origin, targetUrl
    ]);
    const cached = this.cache.get(cacheKey, context.settings.cache_ttl_seconds);
    if (cached) {
      return this.finish({
        ...cached,
        cache_hit: true,
        timing_ms: this.now() - context.started
      }, context);
    }
    try {
      const provider = context.settings.provider === "external_json"
        ? this.externalProvider
        : this.broker;
      const discovered = await provider.search(query, {
        settings: context.settings,
        freshness
      });
      const allowed = [];
      const warnings = [...(discovered.warnings || [])];
      let blockedResults = 0;
      for (const row of discovered.results || []) {
        if (allowed.length >= maximum) break;
        const policy = await evaluateUrl(row.url, {
          mode: context.settings.policy_mode,
          rules: context.settings.url_rules,
          // Bound so the resolver keeps the fetcher as `this` when invoked
          // from inside evaluateUrl.
          resolveHost: this.fetcher.resolveHost?.bind(this.fetcher)
        });
        if (!policy.allowed) {
          blockedResults += 1;
          warnings.push(`Blocked result from ${safeDomain(row.url) || "unknown source"}: ${policy.reason}.`);
          continue;
        }
        allowed.push({ ...row, url: policy.url });
      }
      const formatted = formatResults(allowed, {
        reason: context.reason,
        origin: context.origin,
        settings: context.settings,
        maxResults: maximum
      });
      const fetchedPages = [];
      if (targetUrl) {
        // A failed target fetch only adds a warning; the search still succeeds.
        try {
          const target = await this.fetcher.fetchPage(targetUrl, context.settings);
          fetchedPages.push(formatFetchedPage(target, {
            origin: context.origin,
            settings: context.settings
          }));
        } catch (error) {
          warnings.push(`Target URL fetch failed: ${safeFailure(error)}.`);
        }
      }
      const adapterErrors = discovered.adapter_errors || [];
      let status;
      if (formatted.results.length) {
        status = adapterErrors.length ? "partial" : "ok";
      } else if (blockedResults > 0) {
        status = "blocked";
      } else {
        status = adapterErrors.length ? "unavailable" : "no_results";
      }
      const result = this.result(context, {
        status,
        blocked_reason: status === "blocked" ? "all_results_blocked" : null,
        provider: discovered.provider || context.settings.provider,
        result_count: formatted.results.length,
        results: formatted.results,
        fetched_pages: fetchedPages,
        warnings,
        errors: adapterErrors,
        condensed_text: formatted.condensed_text,
        output_budget_chars: formatted.output_budget_chars,
        truncated: formatted.truncated
      });
      // Only responses that actually contain results are cached.
      if (context.settings.cache_ttl_seconds > 0 && formatted.results.length) {
        this.cache.set(cacheKey, result);
      }
      return this.finish(result, context);
    } catch (error) {
      return this.finish(this.failure(context, error), context);
    }
  }

  /**
   * Fetches a single URL under the concurrency gate (`fetch_url` capability).
   * @param {object} [input] - See `fetchCapability`.
   * @returns {Promise<object>} Finished result.
   */
  async fetchUrl(input = {}) {
    return this.runConcurrent(input, "fetch_url", () => this.fetchCapability(input, "fetch_url"));
  }

  /**
   * Fetches a single URL under the concurrency gate (`summarize_url` capability).
   * @param {object} [input] - See `fetchCapability`.
   * @returns {Promise<object>} Finished result.
   */
  async summarizeUrl(input = {}) {
    return this.runConcurrent(input, "summarize_url", () => this.fetchCapability(input, "summarize_url"));
  }

  /**
   * Runs `callback` through the concurrency gate. An error whose `code` is
   * `"concurrency_limited"` is converted into a blocked result (with a fixed
   * 2 second retry hint); any other error is rethrown.
   *
   * @param {object} input - Original tool input, used to build the audit context.
   * @param {string} capability - Capability name recorded in the audit entry.
   * @param {() => Promise<object>} callback - Work to run.
   * @returns {Promise<object>} The callback's result or a blocked result.
   */
  async runConcurrent(input, capability, callback) {
    try {
      return await this.concurrency.run(callback);
    } catch (error) {
      if (error?.code !== "concurrency_limited") throw error;
      const context = this.context(input, capability);
      context.query = cleanQuery(input.query || input.url || input.target_url);
      return this.finish(this.blocked(context, "concurrency_limited", {
        retry_after_seconds: 2
      }), context);
    }
  }

  /**
   * Shared implementation of `fetch_url` and `summarize_url`: preflight,
   * cache lookup, page fetch, formatting, caching of successful extractions.
   *
   * @param {object} input
   * @param {string} [input.url] - URL to fetch (falls back to `input.target_url`).
   * @param {string} [input.reason] - Defaults to `resource_lookup` for
   *   `summarize_url` and `general_lookup` otherwise.
   * @param {string} capability - `"fetch_url"` or `"summarize_url"`.
   * @returns {Promise<object>} Finished result; fetch errors become a
   *   `failure()` result instead of a rejection.
   */
  async fetchCapability(input, capability) {
    const context = this.context(input, capability);
    const blocked = this.preflight(context);
    if (blocked) return this.finish(blocked, context);
    const targetUrl = String(input.url || input.target_url || "").trim().slice(0, 2048);
    if (!targetUrl) return this.finish(this.blocked(context, "url_required"), context);
    context.query = targetUrl;
    context.reason = normalizeReason(input.reason || (
      capability === "summarize_url" ? "resource_lookup" : "general_lookup"
    ));
    const cacheKey = JSON.stringify([
      capability, targetUrl, context.settings.policy_mode,
      context.settings.url_rules, context.origin
    ]);
    const cached = this.cache.get(cacheKey, context.settings.cache_ttl_seconds);
    if (cached) {
      return this.finish({
        ...cached,
        cache_hit: true,
        timing_ms: this.now() - context.started
      }, context);
    }
    try {
      const page = await this.fetcher.fetchPage(targetUrl, context.settings);
      const formatted = formatFetchedPage(page, {
        origin: context.origin,
        settings: context.settings
      });
      const result = this.result(context, {
        status: formatted.extraction_status === "ok" ? "ok" : "no_results",
        provider: "direct_http",
        result_count: 0,
        results: [],
        fetched_pages: [formatted],
        warnings: formatted.truncated ? ["Extracted page text was truncated to the configured origin budget."] : [],
        errors: [],
        condensed_text: capability === "summarize_url"
          ? formatted.extracted_text
          : [formatted.title, formatted.description, formatted.extracted_text].filter(Boolean).join("\n")
      });
      if (context.settings.cache_ttl_seconds > 0 && formatted.extraction_status === "ok") {
        this.cache.set(cacheKey, result);
      }
      return this.finish(result, context);
    } catch (error) {
      return this.finish(this.failure(context, error), context);
    }
  }

  /**
   * Builds the per-call context: current settings, normalized origin, caller
   * identity fields (each capped at 120 chars) and the start timestamp.
   *
   * @param {object} input - Tool input; reads `ctx`, `user`, `origin`, `reason`.
   * @param {string} capability - Capability being invoked.
   * @returns {object} Context consumed by the other methods.
   */
  context(input, capability) {
    const settings = readSettings(this.dataDir);
    const origin = normalizeOrigin(input.ctx?.origin || input.ctx?.platform || input.origin || "other");
    return {
      started: this.now(),
      capability,
      settings,
      origin,
      actor: String(input.user?.id || input.user?.username || "unknown").slice(0, 120),
      username: String(input.user?.username || "").slice(0, 120),
      role: input.user?.isAdmin ? "admin" : input.user?.isMod ? "mod" : "user",
      server: String(input.ctx?.server_id || input.ctx?.channel_id || "direct").slice(0, 120),
      query: "",
      reason: normalizeReason(input.reason)
    };
  }

  /**
   * Gate checks run before any work, in this order: capability enabled,
   * origin allowed, rate limit (which consumes quota), and — for searches
   * using the `external_json` provider — a configured endpoint.
   *
   * @param {object} context - Context from `context()`.
   * @returns {object|null} A blocked result, or `null` when the call may proceed.
   */
  preflight(context) {
    if (!capabilityEnabled(context.settings, context.capability)) {
      return this.blocked(context, "capability_disabled");
    }
    if (!context.settings.allowed_origins.includes(context.origin)) {
      return this.blocked(context, "origin_not_allowed");
    }
    const rate = this.rateLimits.consume(context, context.settings);
    if (!rate.allowed) {
      return this.blocked(context, "rate_limited", {
        retry_after_seconds: rate.retry_after_seconds
      });
    }
    if (context.settings.provider === "external_json" &&
        context.capability === "search" &&
        !context.settings.external_provider_endpoint) {
      return this.blocked(context, "external_provider_not_configured");
    }
    return null;
  }

  /**
   * Creates a result object with default fields, overridden by `patch`.
   * @param {object} context - Context from `context()`.
   * @param {object} [patch] - Fields that override the defaults.
   * @returns {object} New result object.
   */
  result(context, patch = {}) {
    return {
      status: "ok",
      query: context.query,
      reason: context.reason,
      provider: context.settings.provider,
      policy_mode: context.settings.policy_mode,
      cache_hit: false,
      timing_ms: Math.max(0, this.now() - context.started),
      result_count: 0,
      blocked_reason: null,
      results: [],
      fetched_pages: [],
      warnings: [],
      errors: [],
      ...patch
    };
  }

  /**
   * Creates a result with status `"blocked"`.
   * @param {object} context - Context from `context()`.
   * @param {string} reason - Stored as `blocked_reason`.
   * @param {object} [patch] - Extra fields (e.g. `retry_after_seconds`).
   * @returns {object} Blocked result.
   */
  blocked(context, reason, patch = {}) {
    return this.result(context, {
      status: "blocked",
      blocked_reason: reason,
      ...patch
    });
  }

  /**
   * Converts a thrown error into a result: `"blocked"` when the error code is
   * `URL_BLOCKED`, otherwise `"unavailable"`. The error text is sanitized by
   * `safeFailure`.
   *
   * @param {object} context - Context from `context()`.
   * @param {*} error - The caught error.
   * @returns {object} Failure result.
   */
  failure(context, error) {
    return this.result(context, {
      status: error?.code === "URL_BLOCKED" ? "blocked" : "unavailable",
      blocked_reason: error?.blockedReason || null,
      errors: [safeFailure(error)]
    });
  }

  /**
   * Finalizes a result: stamps `timing_ms` and `user_message` onto it (the
   * object is mutated), appends an audit record and updates the status file.
   *
   * @param {object} result - Result to finalize.
   * @param {object} context - Context from `context()`.
   * @returns {object} The same `result` object.
   */
  finish(result, context) {
    result.timing_ms = Math.max(0, this.now() - context.started);
    result.user_message = userMessage(result);
    const audit = {
      actor: context.actor,
      username: context.username || null,
      role: context.role,
      origin: context.origin,
      server: context.server,
      capability: context.capability,
      // The raw query is never logged: only a short hash and a redacted summary.
      query_hash: queryHash(context.query),
      query_summary: safeQuerySummary(context.query),
      reason: context.reason,
      provider: result.provider,
      policy_mode: result.policy_mode,
      policy_decision: ["ok", "partial", "no_results"].includes(result.status) ? "allowed" : "blocked",
      result_count: result.result_count,
      cache_hit: result.cache_hit,
      timing_ms: result.timing_ms,
      blocked_reason: result.blocked_reason,
      status: result.status
    };
    this.audit(audit);
    this.updateStatus(result, audit);
    return result;
  }

  /**
   * Appends one JSON line (with an ISO timestamp) to `audit.jsonl` in
   * `dataDir`, creating the directory if needed. Synchronous file I/O.
   *
   * @param {object} entry - Audit fields to record.
   * @returns {void}
   */
  audit(entry) {
    fs.mkdirSync(this.dataDir, { recursive: true });
    fs.appendFileSync(path.join(this.dataDir, "audit.jsonl"), `${JSON.stringify({
      timestamp: new Date().toISOString(),
      ...entry
    })}\n`);
  }

  /**
   * Reads the current status, increments the counters for this call, prepends
   * a summary to the `recent` list (capped at 20 entries) and writes the
   * status back.
   *
   * @param {object} result - Finished result.
   * @param {object} audit - Audit entry built in `finish()`.
   * @returns {void}
   */
  updateStatus(result, audit) {
    const current = readStatus(this.dataDir);
    const previous = current.metrics;
    const isSearch = audit.capability === "search";
    const metrics = {
      searches: Number(previous?.searches || 0) + (isSearch ? 1 : 0),
      fetches: Number(previous?.fetches || 0) + (isSearch ? 0 : 1),
      successes: Number(previous?.successes || 0) + (result.status === "ok" ? 1 : 0),
      blocked: Number(previous?.blocked || 0) + (result.status === "blocked" ? 1 : 0),
      cache_hits: Number(previous?.cache_hits || 0) + (result.cache_hit ? 1 : 0),
      failures: Number(previous?.failures || 0) + (result.status === "unavailable" ? 1 : 0),
      total_search_ms: Number(previous?.total_search_ms || 0) +
        (isSearch ? result.timing_ms : 0),
      total_fetch_ms: Number(previous?.total_fetch_ms || 0) +
        (isSearch ? 0 : result.timing_ms),
      failures_by_reason: {
        ...(previous?.failures_by_reason || {})
      }
    };
    // A blocked reason wins; otherwise an unavailable result is keyed by its
    // first error (or the literal "unavailable").
    const failureReason = result.blocked_reason || (
      result.status === "unavailable" ? result.errors?.[0]?.reason || result.errors?.[0] || "unavailable" : null
    );
    if (failureReason) {
      const key = String(failureReason).slice(0, 120);
      metrics.failures_by_reason[key] = Number(metrics.failures_by_reason[key] || 0) + 1;
    }
    const recent = [
      {
        timestamp: new Date().toISOString(),
        capability: audit.capability,
        origin: audit.origin,
        query_summary: audit.query_summary,
        provider: audit.provider,
        status: audit.status,
        blocked_reason: audit.blocked_reason,
        timing_ms: audit.timing_ms
      },
      ...(Array.isArray(current.recent) ? current.recent : [])
    ].slice(0, 20);
    writeStatus(this.dataDir, {
      provider: result.provider,
      provider_health: ["ok", "partial", "no_results"].includes(result.status) ? "available" : "degraded",
      last_success_at: result.status === "ok" ? new Date().toISOString() : current.last_success_at || null,
      last_error: ["blocked", "unavailable"].includes(result.status)
        ? result.blocked_reason || result.errors?.[0] || result.status
        : null,
      cache: this.cache.stats(),
      metrics,
      recent
    });
  }
}
|
|
|
|
/**
 * Normalizes free-form query text: control characters become spaces, runs of
 * whitespace collapse to one space, the text is trimmed and capped at 500
 * UTF-16 code units.
 *
 * @param {*} value - Raw input; falsy values yield an empty string.
 * @returns {string} Cleaned query, at most 500 code units long.
 */
function cleanQuery(value) {
  const maxLength = 500;
  const text = String(value || "")
    .replace(/[\u0000-\u001f\u007f]/g, " ")
    .replace(/\s+/g, " ")
    .trim()
    .slice(0, maxLength);
  // The cut can land between the two halves of a surrogate pair. A dangling
  // high surrogate is not valid text and makes encodeURIComponent throw, so
  // drop it rather than pass it on.
  const lastUnit = text.charCodeAt(text.length - 1);
  return lastUnit >= 0xd800 && lastUnit <= 0xdbff ? text.slice(0, -1) : text;
}
|
|
|
|
/**
 * Returns `value` as a string when it is a member of `REASONS`, otherwise the
 * fallback `"general_lookup"`.
 *
 * @param {*} value - Candidate reason.
 * @returns {string} A reason contained in `REASONS`, or `"general_lookup"`.
 */
function normalizeReason(value) {
  // Stringify once so the value returned is exactly the one that was checked.
  const candidate = String(value || "");
  return REASONS.has(candidate) ? candidate : "general_lookup";
}
|
|
|
|
/**
 * Produces a short identifier for a query: the first 16 hex characters of its
 * SHA-256 digest.
 *
 * @param {*} value - Query text; falsy values are not hashed.
 * @returns {string|null} 16-character hex string, or `null` for falsy input.
 */
function queryHash(value) {
  if (!value) return null;
  return crypto.createHash("sha256").update(String(value)).digest("hex").slice(0, 16);
}
|
|
|
|
/**
 * Builds a log-safe summary of a query or URL: the text is cleaned with
 * `cleanQuery`, URL credentials are removed, sensitive query-string values
 * are replaced with `[REDACTED]`, and the result is capped at 120 characters.
 *
 * @param {*} value - Query text or URL.
 * @returns {string} Redacted summary, at most 120 characters.
 */
function safeQuerySummary(value) {
  let text = cleanQuery(value);
  try {
    const url = new URL(text);
    if (url.username || url.password) {
      url.username = "";
      url.password = "";
      text = url.href;
    }
  } catch {
    // Not a parseable URL: treat it as plain query text.
  }
  // Match any parameter whose NAME contains a sensitive word (api_key,
  // access_token, client_secret, ...), not only the bare words, so the
  // summary redacts the same names that safeFailure() does.
  text = text.replace(
    /([?&][^=&\s]*(?:key|token|secret|password)[^=&\s]*=)[^&\s]+/gi,
    "$1[REDACTED]"
  );
  return text.slice(0, 120);
}
|
|
|
|
/**
 * Extracts the hostname from a URL string without throwing.
 *
 * @param {*} value - Candidate URL.
 * @returns {string} The hostname, or `""` when `value` is not a parseable URL.
 */
function safeDomain(value) {
  try {
    return new URL(value).hostname;
  } catch {
    // Unparseable input is expected here; callers substitute their own label.
    return "";
  }
}
|
|
|
|
/**
 * Turns an error into a short description that is safe to show and log.
 * Known error codes map to fixed phrases; any other message has URLs and
 * credential-like `name=value` pairs scrubbed and is capped at 240 characters.
 *
 * @param {*} error - Caught error (may be null/undefined).
 * @returns {string} Sanitized description.
 */
function safeFailure(error) {
  if (error?.name === "AbortError" || error?.code === "timeout") return "request timed out";
  if (error?.code === "response_too_large") return "response exceeded the configured size limit";
  if (error?.code === "unsupported_content_type") return "content type is not supported";
  if (error?.code === "URL_BLOCKED") return `URL blocked: ${error.blockedReason || "safety policy"}`;
  return String(error?.message || "provider unavailable")
    // URL schemes are case-insensitive, so scrub "HTTPS://..." as well.
    .replace(/https?:\/\/\S+/gi, "[url]")
    .replace(/(key|token|secret|password)=[^&\s]+/gi, "$1=[REDACTED]")
    .slice(0, 240);
}
|
|
|
|
/**
 * Chooses the user-facing message for a result. Success-like statuses are
 * handled first, then specific `blocked_reason` values (checked regardless of
 * status), then the generic blocked message, and finally a generic
 * "unavailable" message.
 *
 * @param {object} result - Result with `status` and optionally
 *   `condensed_text`, `blocked_reason`, `retry_after_seconds`.
 * @returns {string} Message text.
 */
function userMessage(result) {
  if (result.status === "ok") return result.condensed_text || "The public lookup completed.";
  if (result.status === "partial") {
    return "The lookup returned partial results because one or more public sources were unavailable.";
  }
  if (result.status === "no_results") return "No permitted public results were found.";

  switch (result.blocked_reason) {
    case "rate_limited":
      // Falls back to 60 seconds when no retry hint is present.
      return `Web lookup is rate-limited. Retry in ${result.retry_after_seconds || 60} seconds.`;
    case "concurrency_limited":
      return "Web lookup is busy. Retry in a few seconds.";
    case "origin_not_allowed":
      return "Web lookup is not enabled for this platform.";
    case "capability_disabled":
      return "This web lookup capability is disabled.";
    case "external_provider_not_configured":
      return "The optional external provider is selected but not configured. Select Lumi search broker or configure the endpoint.";
    default:
      break;
  }

  if (result.status === "blocked") return "The URL or lookup was blocked by the configured safety policy.";
  return "Live public verification is currently unavailable.";
}
|
|
|
|
module.exports = {
|
|
WebSearchTool,
|
|
cleanQuery,
|
|
normalizeReason,
|
|
queryHash,
|
|
safeFailure,
|
|
safeQuerySummary,
|
|
userMessage
|
|
};
|