const http = require("http"); const https = require("https"); const net = require("net"); const { defaultResolveHost, evaluateNetworkTarget, evaluateUrl, isPrivateAddress } = require("./url_policy"); const MAX_RESPONSE_BYTES = 2 * 1024 * 1024; const MAX_PAGE_BYTES = 512 * 1024; class SearchProvider { constructor(options = {}) { this.fetch = options.fetch || null; this.resolveHost = options.resolveHost; } async search(query, options) { const endpoint = buildEndpoint(query, options); const response = await this.request(endpoint, options, true, MAX_RESPONSE_BYTES); const payload = JSON.parse(response.body.toString("utf8")); return normalizeProviderResults(payload, options.provider_adapter); } async fetchPage(url, options) { const response = await this.request(url, options, false, MAX_PAGE_BYTES); const contentType = String(response.headers.get("content-type") || "").toLowerCase(); if (!contentType.includes("text/html") && !contentType.includes("text/plain")) { throw new Error("Page content type is not supported."); } return { url: response.url, text: extractPageText(response.body.toString("utf8")).slice(0, 6000) }; } async request(initialUrl, options, providerRequest, maxBytes) { let current = initialUrl; const providerOrigin = providerRequest ? new URL(initialUrl).origin : null; for (let redirects = 0; redirects <= 3; redirects += 1) { const policy = providerRequest ? await evaluateNetworkTarget(current, { resolveHost: this.resolveHost }) : await evaluateUrl(current, { mode: options.policy_mode, rules: options.url_rules, resolveHost: this.resolveHost }); if (!policy.allowed) throw blockedError(policy.reason); if (providerRequest && new URL(policy.url).origin !== providerOrigin) { throw blockedError("cross_origin_provider_redirect"); } const controller = new AbortController(); const timer = setTimeout(() => controller.abort(), options.search_timeout_ms); timer.unref?.(); try { const headers = { Accept: providerRequest ? "application/json" : "text/html,text/plain;q=0.9", "User-Agent": "Lumi-AI-Web-Search/1.0" }; if (providerRequest && options.provider_api_key) { headers[options.provider_api_key_header] = [ options.provider_api_key_prefix, options.provider_api_key ].filter(Boolean).join(" "); } const response = this.fetch ? await this.fetch(policy.url, { method: "GET", headers, redirect: "manual", signal: controller.signal }) : await safeHttpRequest(policy.url, { headers, timeoutMs: options.search_timeout_ms, maxBytes, resolveHost: this.resolveHost }); if (response.status >= 300 && response.status < 400) { const location = response.headers.get("location"); if (!location) throw new Error("Provider redirect did not include a location."); current = new URL(location, policy.url).href; continue; } if (!response.ok) throw new Error(`Search provider request failed (${response.status}).`); return { url: policy.url, headers: response.headers, body: response.body || await readBounded(response, maxBytes) }; } finally { clearTimeout(timer); } } throw new Error("Search request exceeded the redirect limit."); } } async function safeHttpRequest(value, options = {}) { const url = new URL(value); const hostname = url.hostname.replace(/^\[|\]$/g, ""); const resolveHost = options.resolveHost || defaultResolveHost; const addresses = await resolveHost(hostname); const address = addresses.find((entry) => !isPrivateAddress(entry)); if (!address) throw blockedError("private_network"); const transport = url.protocol === "https:" ? https : http; return new Promise((resolve, reject) => { const request = transport.request({ protocol: url.protocol, hostname, port: url.port || undefined, method: "GET", path: `${url.pathname}${url.search}`, headers: options.headers, servername: url.protocol === "https:" && !net.isIP(hostname) ? hostname : undefined, lookup: (_hostname, _options, callback) => callback(null, address, net.isIP(address)) }, (response) => { const chunks = []; let size = 0; response.on("data", (chunk) => { size += chunk.length; if (size > options.maxBytes) { request.destroy(new Error("Provider response is too large.")); return; } chunks.push(chunk); }); response.on("end", () => resolve({ ok: response.statusCode >= 200 && response.statusCode < 300, status: response.statusCode, headers: { get: (name) => response.headers[String(name).toLowerCase()] || null }, body: Buffer.concat(chunks) })); }); request.setTimeout(options.timeoutMs, () => { const error = new Error("Search provider timed out."); error.name = "AbortError"; request.destroy(error); }); request.on("error", reject); request.end(); }); } function buildEndpoint(query, settings) { if (!settings.provider_endpoint) throw new Error("Search provider endpoint is not configured."); const endpoint = settings.provider_endpoint.includes("{query}") ? settings.provider_endpoint.replaceAll("{query}", encodeURIComponent(query)) : settings.provider_endpoint; const url = new URL(endpoint); if (!settings.provider_endpoint.includes("{query}")) { url.searchParams.set(settings.provider_query_parameter || "q", query); } url.searchParams.set("format", "json"); url.searchParams.set("safesearch", safeSearchValue(settings.safe_search)); url.searchParams.set("count", String(settings.max_results)); if (settings.freshness) url.searchParams.set("time_range", String(settings.freshness).slice(0, 32)); return url.href; } function normalizeProviderResults(payload, adapter) { const rows = adapter === "searxng_json" ? payload?.results : payload?.results || payload?.items || payload?.web?.results?.value; if (!Array.isArray(rows)) throw new Error("Search provider response does not contain a supported result list."); return rows.map((row, index) => ({ title: sanitizeText(row.title || row.name || "Untitled result", 240), url: String(row.url || row.link || ""), snippet: sanitizeText(row.content || row.snippet || row.description || "", 800), source_type: sanitizeText(row.source_type || row.category || row.engine || "", 80) || null, date: normalizeDate(row.publishedDate || row.published_date || row.date), relevance_score: finiteScore(row.score, index) })).filter((row) => row.url); } async function readBounded(response, maxBytes) { const declared = Number(response.headers.get("content-length")); if (Number.isFinite(declared) && declared > maxBytes) throw new Error("Provider response is too large."); const buffer = Buffer.from(await response.arrayBuffer()); if (buffer.length > maxBytes) throw new Error("Provider response is too large."); return buffer; } function extractPageText(value) { return sanitizeText( String(value) .replace(/]*>[\s\S]*?<\/script>/gi, " ") .replace(/]*>[\s\S]*?<\/style>/gi, " ") .replace(/<[^>]+>/g, " "), 12000 ); } function sanitizeText(value, maximum) { return decodeEntities(String(value || "").replace(/<[^>]+>/g, " ")) .replace(/[\u0000-\u001f\u007f]/g, " ") .replace(/\s+/g, " ") .trim() .slice(0, maximum); } function decodeEntities(value) { return value .replaceAll("&", "&") .replaceAll("<", "<") .replaceAll(">", ">") .replaceAll(""", "\"") .replaceAll("'", "'"); } function normalizeDate(value) { if (!value) return null; const date = new Date(value); return Number.isNaN(date.getTime()) ? null : date.toISOString(); } function finiteScore(value, index) { const number = Number(value); return Number.isFinite(number) ? number : Math.max(0, 1 - index * 0.1); } function safeSearchValue(level) { if (level === "off") return "0"; if (level === "moderate") return "1"; return "2"; } function blockedError(reason) { const error = new Error(`URL blocked by policy: ${reason}.`); error.code = "URL_BLOCKED"; error.blockedReason = reason; return error; } module.exports = { MAX_PAGE_BYTES, MAX_RESPONSE_BYTES, SearchProvider, blockedError, buildEndpoint, extractPageText, normalizeProviderResults, readBounded, sanitizeText, safeHttpRequest };