// Lumi/plugins/lumi_ai_web_search/backend/provider_adapter.js
// 2026-06-13 21:32:36 +02:00
//
// 249 lines
// 8.7 KiB
// JavaScript

const http = require("http");
const https = require("https");
const net = require("net");
const {
defaultResolveHost,
evaluateNetworkTarget,
evaluateUrl,
isPrivateAddress
} = require("./url_policy");
// Byte limit (2 MiB) that SearchProvider.search passes to request() for the provider's response body.
const MAX_RESPONSE_BYTES = 2 * 1024 * 1024;
// Byte limit (512 KiB) that SearchProvider.fetchPage passes to request() for a fetched page body.
const MAX_PAGE_BYTES = 512 * 1024;
/**
 * Issues policy-checked GET requests against a search provider and against
 * individual result pages.
 *
 * Every hop (the initial URL and each redirect target) is validated again
 * before it is requested, so a redirect cannot reach a target the policy
 * rejects.
 */
class SearchProvider {
  /**
   * @param {object} [options]
   * @param {Function} [options.fetch] Optional fetch-compatible transport.
   *   When omitted, requests go through safeHttpRequest.
   * @param {Function} [options.resolveHost] Optional host resolver handed to
   *   the URL policy helpers and to safeHttpRequest.
   */
  constructor(options = {}) {
    this.fetch = options.fetch || null;
    this.resolveHost = options.resolveHost;
  }

  /**
   * Queries the configured provider and returns normalized result rows.
   *
   * @param {string} query Search terms.
   * @param {object} options Settings object; `provider_adapter` is read here,
   *   the rest is consumed by buildEndpoint and request.
   * @returns {Promise<object[]>} Rows produced by normalizeProviderResults.
   * @throws {SyntaxError} When the provider body is not valid JSON.
   */
  async search(query, options) {
    const endpoint = buildEndpoint(query, options);
    const response = await this.request(endpoint, options, true, MAX_RESPONSE_BYTES);
    const payload = JSON.parse(response.body.toString("utf8"));
    return normalizeProviderResults(payload, options.provider_adapter);
  }

  /**
   * Downloads a result page and reduces it to plain text.
   *
   * @param {string} url Page address.
   * @param {object} options Settings object forwarded to request.
   * @returns {Promise<{url: string, text: string}>} The final URL after
   *   redirects and at most 6000 characters of extracted text.
   * @throws {Error} When the content type is neither text/html nor text/plain.
   */
  async fetchPage(url, options) {
    const response = await this.request(url, options, false, MAX_PAGE_BYTES);
    const contentType = String(response.headers.get("content-type") || "").toLowerCase();
    if (!contentType.includes("text/html") && !contentType.includes("text/plain")) {
      throw new Error("Page content type is not supported.");
    }
    return {
      url: response.url,
      text: extractPageText(response.body.toString("utf8")).slice(0, 6000)
    };
  }

  /**
   * Performs one GET, following at most three redirects by hand.
   *
   * Provider requests are checked with evaluateNetworkTarget and must stay on
   * the origin of the initial URL; page requests are checked with evaluateUrl
   * using `options.policy_mode` and `options.url_rules`. The API key header is
   * attached to provider requests only.
   *
   * @param {string} initialUrl First URL to request.
   * @param {object} options Settings object (`search_timeout_ms`, policy and
   *   API key fields are read).
   * @param {boolean} providerRequest True for provider calls, false for pages.
   * @param {number} maxBytes Upper bound on the response body size.
   * @returns {Promise<{url: string, headers: object, body: *}>} The URL that
   *   answered, its headers, and the body (a Buffer for the built-in transport
   *   and for fetch-compatible responses).
   * @throws {Error} With code "URL_BLOCKED" when the policy rejects a hop, and
   *   plain errors for non-2xx statuses, oversized bodies, redirects without a
   *   location, or too many redirects.
   */
  async request(initialUrl, options, providerRequest, maxBytes) {
    let current = initialUrl;
    const providerOrigin = providerRequest ? new URL(initialUrl).origin : null;
    for (let redirects = 0; redirects <= 3; redirects += 1) {
      const policy = providerRequest
        ? await evaluateNetworkTarget(current, { resolveHost: this.resolveHost })
        : await evaluateUrl(current, {
          mode: options.policy_mode,
          rules: options.url_rules,
          resolveHost: this.resolveHost
        });
      if (!policy.allowed) throw blockedError(policy.reason);
      if (providerRequest && new URL(policy.url).origin !== providerOrigin) {
        throw blockedError("cross_origin_provider_redirect");
      }
      const controller = new AbortController();
      const timer = setTimeout(() => controller.abort(), options.search_timeout_ms);
      // Do not let a pending timeout keep the process alive.
      timer.unref?.();
      try {
        const headers = {
          Accept: providerRequest ? "application/json" : "text/html,text/plain;q=0.9",
          "User-Agent": "Lumi-AI-Web-Search/1.0"
        };
        if (providerRequest && options.provider_api_key) {
          headers[options.provider_api_key_header] = [
            options.provider_api_key_prefix,
            options.provider_api_key
          ].filter(Boolean).join(" ");
        }
        const response = this.fetch
          ? await this.fetch(policy.url, {
            method: "GET",
            headers,
            // Redirects are followed by this loop so each target is re-checked.
            redirect: "manual",
            signal: controller.signal
          })
          : await safeHttpRequest(policy.url, {
            headers,
            timeoutMs: options.search_timeout_ms,
            maxBytes,
            resolveHost: this.resolveHost
          });
        if (response.status >= 300 && response.status < 400) {
          const location = response.headers.get("location");
          if (!location) throw new Error("Provider redirect did not include a location.");
          current = new URL(location, policy.url).href;
          continue;
        }
        if (!response.ok) throw new Error(`Search provider request failed (${response.status}).`);
        // A fetch-compatible response exposes `body` as a stream rather than as
        // bytes, so a truthy `body` alone does not mean the payload is already
        // in memory. Only a Buffer is used directly (and still size-checked);
        // everything that can be read through arrayBuffer() goes through
        // readBounded. The read stays inside this try so the timeout covers it.
        let body = response.body;
        if (Buffer.isBuffer(body)) {
          if (body.length > maxBytes) throw new Error("Provider response is too large.");
        } else if (!body || typeof response.arrayBuffer === "function") {
          body = await readBounded(response, maxBytes);
        }
        return {
          url: policy.url,
          headers: response.headers,
          body
        };
      } finally {
        clearTimeout(timer);
      }
    }
    throw new Error("Search request exceeded the redirect limit.");
  }
}
/**
 * Performs a GET over http/https against a pre-resolved, non-private address.
 *
 * The host is resolved once, the first address that isPrivateAddress does not
 * flag is selected, and the socket is pinned to that address through a custom
 * `lookup`, so the connection cannot end up on a different address than the
 * one that was checked.
 *
 * @param {string} value Absolute URL to request.
 * @param {object} [options]
 * @param {object} [options.headers] Request headers.
 * @param {number} [options.timeoutMs] Socket timeout passed to request.setTimeout.
 * @param {number} [options.maxBytes] Body size limit; defaults to
 *   MAX_RESPONSE_BYTES so an omitted limit never means "unbounded".
 * @param {Function} [options.resolveHost] Resolver; defaults to defaultResolveHost.
 * @returns {Promise<{ok: boolean, status: number, headers: {get: Function}, body: Buffer}>}
 * @throws {Error} With code "URL_BLOCKED" when every resolved address is
 *   private; rejects on transport errors, timeouts, oversized or truncated
 *   responses.
 */
async function safeHttpRequest(value, options = {}) {
  const url = new URL(value);
  // URL keeps the brackets around IPv6 literals; strip them for the resolver and socket.
  const hostname = url.hostname.replace(/^\[|\]$/g, "");
  const resolveHost = options.resolveHost || defaultResolveHost;
  const addresses = await resolveHost(hostname);
  const address = addresses.find((entry) => !isPrivateAddress(entry));
  if (!address) throw blockedError("private_network");
  const family = net.isIP(address);
  const maxBytes = options.maxBytes ?? MAX_RESPONSE_BYTES;
  const transport = url.protocol === "https:" ? https : http;
  return new Promise((resolve, reject) => {
    const request = transport.request({
      protocol: url.protocol,
      hostname,
      port: url.port || undefined,
      method: "GET",
      path: `${url.pathname}${url.search}`,
      headers: options.headers,
      servername: url.protocol === "https:" && !net.isIP(hostname) ? hostname : undefined,
      lookup: (_hostname, lookupOptions, callback) => {
        // Node calls a custom lookup with `all: true` when address-family
        // auto-selection is active and then expects an array of records
        // instead of the (address, family) pair.
        if (lookupOptions && lookupOptions.all) {
          callback(null, [{ address, family }]);
          return;
        }
        callback(null, address, family);
      }
    }, (response) => {
      const chunks = [];
      let size = 0;
      response.on("data", (chunk) => {
        size += chunk.length;
        if (size > maxBytes) {
          const error = new Error("Provider response is too large.");
          // Settle first so this is the reason callers see, then tear down.
          reject(error);
          request.destroy(error);
          return;
        }
        chunks.push(chunk);
      });
      response.on("end", () => resolve({
        ok: response.statusCode >= 200 && response.statusCode < 300,
        status: response.statusCode,
        headers: { get: (name) => response.headers[String(name).toLowerCase()] || null },
        body: Buffer.concat(chunks)
      }));
      // Without these two listeners a connection that drops mid-body emits
      // neither "end" nor a request-level error, leaving the promise pending.
      response.on("error", reject);
      response.on("close", () => {
        if (!response.complete) reject(new Error("Provider response ended prematurely."));
      });
    });
    request.setTimeout(options.timeoutMs, () => {
      const error = new Error("Search provider timed out.");
      error.name = "AbortError";
      request.destroy(error);
    });
    request.on("error", reject);
    request.end();
  });
}
/**
 * Builds the provider request URL for a query.
 *
 * When the configured endpoint contains the `{query}` placeholder, every
 * occurrence is replaced with the URI-encoded query; otherwise the query is
 * added as a search parameter (named by `provider_query_parameter`, "q" when
 * unset). The format, safe-search, count and optional time-range parameters
 * are then set on top.
 *
 * @param {string} query Search terms.
 * @param {object} settings Provider settings (`provider_endpoint`,
 *   `provider_query_parameter`, `safe_search`, `max_results`, `freshness`).
 * @returns {string} The serialized URL.
 * @throws {Error} When no endpoint is configured.
 */
function buildEndpoint(query, settings) {
  const template = settings.provider_endpoint;
  if (!template) {
    throw new Error("Search provider endpoint is not configured.");
  }

  const hasPlaceholder = template.includes("{query}");
  const target = new URL(
    hasPlaceholder ? template.replaceAll("{query}", encodeURIComponent(query)) : template
  );
  const { searchParams } = target;

  if (!hasPlaceholder) {
    searchParams.set(settings.provider_query_parameter || "q", query);
  }

  const fixedParameters = [
    ["format", "json"],
    ["safesearch", safeSearchValue(settings.safe_search)],
    ["count", String(settings.max_results)]
  ];
  for (const [name, parameterValue] of fixedParameters) {
    searchParams.set(name, parameterValue);
  }

  if (settings.freshness) {
    // The freshness value is capped at 32 characters before it is sent.
    searchParams.set("time_range", String(settings.freshness).slice(0, 32));
  }
  return target.href;
}
/**
 * Converts a provider payload into the adapter's uniform result rows.
 *
 * For the "searxng_json" adapter only `payload.results` is considered. For any
 * other adapter the first truthy of `payload.results`, `payload.items` and
 * `payload.web.results` is used; the latter is accepted both as a plain array
 * and as an object carrying the array in `value`.
 *
 * Entries that are not objects (for example `null`) are skipped instead of
 * aborting the whole result set, and rows without a URL are dropped. The
 * fallback relevance score is based on the row's position in the provider list.
 *
 * @param {object} payload Parsed provider response.
 * @param {string} adapter Adapter identifier from the settings.
 * @returns {Array<{title: string, url: string, snippet: string,
 *   source_type: (string|null), date: (string|null), relevance_score: number}>}
 * @throws {Error} When no result list is found in the payload.
 */
function normalizeProviderResults(payload, adapter) {
  const webResults = payload?.web?.results;
  const rows = adapter === "searxng_json"
    ? payload?.results
    : payload?.results
      || payload?.items
      || (Array.isArray(webResults) ? webResults : webResults?.value);
  if (!Array.isArray(rows)) throw new Error("Search provider response does not contain a supported result list.");

  const normalized = [];
  for (const [index, row] of rows.entries()) {
    // Provider output is untrusted; a null or scalar entry must not throw.
    if (row === null || typeof row !== "object") continue;
    const url = String(row.url || row.link || "");
    if (!url) continue;
    normalized.push({
      title: sanitizeText(row.title || row.name || "Untitled result", 240),
      url,
      snippet: sanitizeText(row.content || row.snippet || row.description || "", 800),
      source_type: sanitizeText(row.source_type || row.category || row.engine || "", 80) || null,
      date: normalizeDate(row.publishedDate || row.published_date || row.date),
      relevance_score: finiteScore(row.score, index)
    });
  }
  return normalized;
}
/**
 * Reads a fetch-style response body into a Buffer without exceeding a limit.
 *
 * A declared content-length above the limit is rejected up front. When the
 * response exposes a readable stream in `body`, it is consumed chunk by chunk
 * and cancelled as soon as the limit is crossed, so an oversized or endless
 * body is never held in memory in full. Responses without such a stream fall
 * back to arrayBuffer() followed by a length check.
 *
 * @param {object} response Object with `headers.get()` and either a `body`
 *   stream offering getReader() or an `arrayBuffer()` method.
 * @param {number} maxBytes Maximum accepted body size in bytes.
 * @returns {Promise<Buffer>} The complete body.
 * @throws {Error} "Provider response is too large." when the limit is exceeded.
 */
async function readBounded(response, maxBytes) {
  const declared = Number(response.headers.get("content-length"));
  if (Number.isFinite(declared) && declared > maxBytes) throw new Error("Provider response is too large.");

  const stream = response.body;
  if (!stream || typeof stream.getReader !== "function") {
    const buffer = Buffer.from(await response.arrayBuffer());
    if (buffer.length > maxBytes) throw new Error("Provider response is too large.");
    return buffer;
  }

  const reader = stream.getReader();
  const chunks = [];
  let size = 0;
  for (;;) {
    const { done, value } = await reader.read();
    if (done) break;
    size += value.byteLength;
    if (size > maxBytes) {
      // Cancellation is best effort; the size violation is the error to report.
      await reader.cancel().catch(() => undefined);
      throw new Error("Provider response is too large.");
    }
    chunks.push(value);
  }
  return Buffer.concat(chunks, size);
}
/**
 * Reduces an HTML (or plain text) document to readable text.
 *
 * Script and style elements are removed together with their contents, the
 * remaining tags are replaced by spaces, and the result is passed through
 * sanitizeText with a 12000 character limit.
 *
 * @param {*} value Document source; coerced with String().
 * @returns {string} Extracted text.
 */
function extractPageText(value) {
  const source = String(value);
  const withoutScripts = source.replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, " ");
  const withoutStyles = withoutScripts.replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, " ");
  const withoutTags = withoutStyles.replace(/<[^>]+>/g, " ");
  return sanitizeText(withoutTags, 12000);
}
/**
 * Turns arbitrary input into a single-line plain-text string.
 *
 * Steps, in order: falsy input becomes an empty string, tags are replaced by
 * spaces, entities are decoded via decodeEntities, control characters become
 * spaces, whitespace runs collapse to one space, the ends are trimmed, and the
 * result is cut to `maximum` characters.
 *
 * @param {*} value Text to clean.
 * @param {number} maximum Maximum length of the returned string.
 * @returns {string} Cleaned text.
 */
function sanitizeText(value, maximum) {
  const source = String(value || "");
  const untagged = source.replace(/<[^>]+>/g, " ");
  const decoded = decodeEntities(untagged);
  const printable = decoded.replace(/[\u0000-\u001f\u007f]/g, " ");
  const collapsed = printable.replace(/\s+/g, " ").trim();
  return collapsed.slice(0, maximum);
}
/**
 * Decodes the five entities this adapter understands: `&amp;`, `&lt;`,
 * `&gt;`, `&quot;` and `&#39;`.
 *
 * Decoding happens in a single pass so that text produced by one replacement
 * is never decoded again: `&amp;lt;` yields the literal text `&lt;`, not `<`.
 *
 * @param {string} value Text that may contain entities.
 * @returns {string} Text with the supported entities replaced.
 */
function decodeEntities(value) {
  const replacements = {
    "&amp;": "&",
    "&lt;": "<",
    "&gt;": ">",
    "&quot;": "\"",
    "&#39;": "'"
  };
  return value.replace(/&(?:amp|lt|gt|quot|#39);/g, (entity) => replacements[entity]);
}
/**
 * Converts a provider date value to an ISO 8601 string.
 *
 * @param {*} value Anything accepted by the Date constructor.
 * @returns {(string|null)} The ISO string, or null when the value is falsy or
 *   does not parse to a valid date.
 */
function normalizeDate(value) {
  if (!value) {
    return null;
  }
  const parsed = new Date(value);
  if (Number.isNaN(parsed.valueOf())) {
    return null;
  }
  return parsed.toISOString();
}
/**
 * Returns the provider's relevance score, or a positional fallback.
 *
 * Only numbers and non-blank strings are treated as scores. Other values are
 * not coerced, because `Number(null)`, `Number("")` and `Number([])` are 0 and
 * `Number(true)` is 1, which would turn a missing score into a real one.
 *
 * The fallback decreases by 0.1 per position and never drops below 0.
 *
 * @param {*} value Score reported by the provider, if any.
 * @param {number} index Zero-based position of the row in the result list.
 * @returns {number} A finite score.
 */
function finiteScore(value, index) {
  const looksNumeric = typeof value === "number"
    || (typeof value === "string" && value.trim() !== "");
  const number = looksNumeric ? Number(value) : Number.NaN;
  return Number.isFinite(number) ? number : Math.max(0, 1 - index * 0.1);
}
/**
 * Maps the safe-search setting to the value sent to the provider.
 *
 * @param {*} level "off" or "moderate"; anything else selects the strictest value.
 * @returns {string} "0" for off, "1" for moderate, "2" otherwise.
 */
function safeSearchValue(level) {
  switch (level) {
    case "off":
      return "0";
    case "moderate":
      return "1";
    default:
      return "2";
  }
}
/**
 * Creates the error used when the URL policy rejects a target.
 *
 * @param {string} reason Policy reason code.
 * @returns {Error} Error whose `code` is "URL_BLOCKED" and whose
 *   `blockedReason` carries the given reason.
 */
function blockedError(reason) {
  return Object.assign(new Error(`URL blocked by policy: ${reason}.`), {
    code: "URL_BLOCKED",
    blockedReason: reason
  });
}
// Public surface of this module: the two size limits, the SearchProvider class,
// and the standalone helpers defined above.
module.exports = {
MAX_PAGE_BYTES,
MAX_RESPONSE_BYTES,
SearchProvider,
blockedError,
buildEndpoint,
extractPageText,
normalizeProviderResults,
readBounded,
sanitizeText,
safeHttpRequest
};