249 lines
8.7 KiB
JavaScript
249 lines
8.7 KiB
JavaScript
const http = require("http");
|
|
const https = require("https");
|
|
const net = require("net");
|
|
const {
|
|
defaultResolveHost,
|
|
evaluateNetworkTarget,
|
|
evaluateUrl,
|
|
isPrivateAddress
|
|
} = require("./url_policy");
|
|
|
|
// Byte limit (2 MiB) that SearchProvider.search passes to request() for provider responses.
const MAX_RESPONSE_BYTES = 2 * 1024 * 1024;
|
|
// Byte limit (512 KiB) that SearchProvider.fetchPage passes to request() for fetched pages.
const MAX_PAGE_BYTES = 512 * 1024;
|
|
|
|
/**
 * Issues policy-checked GET requests to a search provider and to result pages.
 *
 * Every hop (including each redirect target) is re-evaluated against the URL
 * policy before it is requested, and redirects are followed manually.
 */
class SearchProvider {
  /**
   * @param {object} [options]
   * @param {Function} [options.fetch] - fetch-compatible function. When omitted,
   *   requests are sent through safeHttpRequest instead.
   * @param {Function} [options.resolveHost] - resolver forwarded to the URL
   *   policy checks and to safeHttpRequest.
   */
  constructor(options = {}) {
    this.fetch = options.fetch || null;
    this.resolveHost = options.resolveHost;
  }

  /**
   * Queries the configured provider and returns normalized result rows.
   *
   * @param {string} query - search terms.
   * @param {object} options - search settings (endpoint, adapter, timeout, ...).
   * @returns {Promise<object[]>} rows produced by normalizeProviderResults.
   * @throws {Error} when the request fails or the body is not valid JSON.
   */
  async search(query, options) {
    const endpoint = buildEndpoint(query, options);
    const response = await this.request(endpoint, options, true, MAX_RESPONSE_BYTES);
    const payload = JSON.parse(response.body.toString("utf8"));
    return normalizeProviderResults(payload, options.provider_adapter);
  }

  /**
   * Downloads a page and returns its final URL plus extracted plain text.
   *
   * @param {string} url - page to fetch.
   * @param {object} options - search settings (policy mode, rules, timeout, ...).
   * @returns {Promise<{url: string, text: string}>} text is capped at 6000 characters.
   * @throws {Error} when the content type is neither text/html nor text/plain.
   */
  async fetchPage(url, options) {
    const response = await this.request(url, options, false, MAX_PAGE_BYTES);
    const contentType = String(response.headers.get("content-type") || "").toLowerCase();
    if (!contentType.includes("text/html") && !contentType.includes("text/plain")) {
      throw new Error("Page content type is not supported.");
    }
    return {
      url: response.url,
      text: extractPageText(response.body.toString("utf8")).slice(0, 6000)
    };
  }

  /**
   * Sends a GET request, following up to three redirects by hand so that each
   * target is policy-checked before it is contacted.
   *
   * @param {string} initialUrl - first URL to request.
   * @param {object} options - search settings; reads search_timeout_ms, the
   *   policy fields and the provider API key fields.
   * @param {boolean} providerRequest - true for provider calls (network-target
   *   policy, same-origin redirects only, API key header attached); false for
   *   page fetches (full URL policy).
   * @param {number} maxBytes - upper bound on the response body size.
   * @returns {Promise<{url: string, headers: object, body: (Buffer|string)}>}
   * @throws {Error} with code "URL_BLOCKED" when the policy rejects a hop, or a
   *   plain Error for HTTP failures, missing Location headers and too many redirects.
   */
  async request(initialUrl, options, providerRequest, maxBytes) {
    let current = initialUrl;
    const providerOrigin = providerRequest ? new URL(initialUrl).origin : null;
    for (let redirects = 0; redirects <= 3; redirects += 1) {
      const policy = providerRequest
        ? await evaluateNetworkTarget(current, { resolveHost: this.resolveHost })
        : await evaluateUrl(current, {
          mode: options.policy_mode,
          rules: options.url_rules,
          resolveHost: this.resolveHost
        });
      if (!policy.allowed) throw blockedError(policy.reason);
      // Provider requests carry the API key, so they must never leave the configured origin.
      if (providerRequest && new URL(policy.url).origin !== providerOrigin) {
        throw blockedError("cross_origin_provider_redirect");
      }
      const controller = new AbortController();
      const timer = setTimeout(() => controller.abort(), options.search_timeout_ms);
      timer.unref?.();
      try {
        const headers = {
          Accept: providerRequest ? "application/json" : "text/html,text/plain;q=0.9",
          "User-Agent": "Lumi-AI-Web-Search/1.0"
        };
        if (providerRequest && options.provider_api_key) {
          headers[options.provider_api_key_header] = [
            options.provider_api_key_prefix,
            options.provider_api_key
          ].filter(Boolean).join(" ");
        }
        const response = this.fetch
          ? await this.fetch(policy.url, {
            method: "GET",
            headers,
            redirect: "manual",
            signal: controller.signal
          })
          : await safeHttpRequest(policy.url, {
            headers,
            timeoutMs: options.search_timeout_ms,
            maxBytes,
            resolveHost: this.resolveHost
          });
        if (response.status >= 300 && response.status < 400) {
          const location = response.headers.get("location");
          if (!location) throw new Error("Provider redirect did not include a location.");
          current = new URL(location, policy.url).href;
          continue;
        }
        if (!response.ok) throw new Error(`Search provider request failed (${response.status}).`);
        // A fetch Response exposes `body` as an unread stream, which is truthy but
        // is not the payload. Only reuse `body` when it is already buffered data
        // (as safeHttpRequest returns); otherwise read it with the size bound.
        const buffered = response.body;
        const alreadyBuffered = Buffer.isBuffer(buffered)
          || (typeof buffered === "string" && buffered.length > 0);
        const body = alreadyBuffered ? buffered : await readBounded(response, maxBytes);
        return {
          url: policy.url,
          headers: response.headers,
          body
        };
      } finally {
        clearTimeout(timer);
      }
    }
    throw new Error("Search request exceeded the redirect limit.");
  }
}
|
|
|
|
/**
 * Sends a GET request whose connection is pinned to an address that was
 * resolved and checked up front, so the socket cannot be pointed at a
 * different (private) address by a later DNS answer.
 *
 * @param {string} value - absolute http(s) URL.
 * @param {object} [options]
 * @param {object} [options.headers] - request headers.
 * @param {number} [options.timeoutMs] - socket inactivity timeout; no timeout is
 *   installed when this is missing or not a positive finite number.
 * @param {number} [options.maxBytes] - maximum accepted body size in bytes.
 * @param {Function} [options.resolveHost] - resolver returning candidate
 *   addresses; defaults to defaultResolveHost.
 * @returns {Promise<{ok: boolean, status: number, headers: {get: Function}, body: Buffer}>}
 * @throws {Error} with code "URL_BLOCKED" when every resolved address is private;
 *   rejects on transport errors, timeouts, oversized or truncated responses.
 */
async function safeHttpRequest(value, options = {}) {
  const url = new URL(value);
  // URL keeps the brackets around IPv6 literals; the resolver and socket want the bare address.
  const hostname = url.hostname.replace(/^\[|\]$/g, "");
  const resolveHost = options.resolveHost || defaultResolveHost;
  const addresses = await resolveHost(hostname);
  const address = addresses.find((entry) => !isPrivateAddress(entry));
  if (!address) throw blockedError("private_network");
  const family = net.isIP(address);
  const transport = url.protocol === "https:" ? https : http;
  return new Promise((resolve, reject) => {
    const request = transport.request({
      protocol: url.protocol,
      hostname,
      port: url.port || undefined,
      method: "GET",
      path: `${url.pathname}${url.search}`,
      headers: options.headers,
      servername: url.protocol === "https:" && !net.isIP(hostname) ? hostname : undefined,
      // Always answer with the vetted address. Node may call lookup with
      // `all: true` (address-family auto-selection) and then expects an array
      // of { address, family } records rather than a single string.
      lookup: (_hostname, lookupOptions, callback) => {
        const done = typeof lookupOptions === "function" ? lookupOptions : callback;
        if (lookupOptions && lookupOptions.all === true) {
          done(null, [{ address, family }]);
          return;
        }
        done(null, address, family);
      }
    }, (response) => {
      const chunks = [];
      let size = 0;
      response.on("data", (chunk) => {
        size += chunk.length;
        if (size > options.maxBytes) {
          const error = new Error("Provider response is too large.");
          // Reject directly: once a response has started, a destroy error may be
          // reported on the response stream instead of the request.
          reject(error);
          request.destroy(error);
          return;
        }
        chunks.push(chunk);
      });
      // Failures after the headers arrived surface on the response stream.
      response.on("error", reject);
      response.on("close", () => {
        // No-op when the promise already settled; otherwise the body was cut short.
        if (!response.complete) reject(new Error("Provider response ended before it was complete."));
      });
      response.on("end", () => resolve({
        ok: response.statusCode >= 200 && response.statusCode < 300,
        status: response.statusCode,
        headers: { get: (name) => response.headers[String(name).toLowerCase()] || null },
        body: Buffer.concat(chunks)
      }));
    });
    // request.setTimeout throws on a non-numeric delay, so only install it for usable values.
    if (Number.isFinite(options.timeoutMs) && options.timeoutMs > 0) {
      request.setTimeout(options.timeoutMs, () => {
        const error = new Error("Search provider timed out.");
        error.name = "AbortError";
        reject(error);
        request.destroy(error);
      });
    }
    request.on("error", reject);
    request.end();
  });
}
|
|
|
|
/**
 * Builds the provider request URL for a query.
 *
 * The configured endpoint may contain a "{query}" placeholder, which is
 * replaced with the URI-encoded query; otherwise the query is added as a
 * search parameter (named by provider_query_parameter, default "q").
 *
 * @param {string} query - search terms.
 * @param {object} settings - reads provider_endpoint, provider_query_parameter,
 *   safe_search, max_results and freshness.
 * @returns {string} the absolute endpoint URL.
 * @throws {Error} when no provider endpoint is configured.
 */
function buildEndpoint(query, settings) {
  const template = settings.provider_endpoint;
  if (!template) throw new Error("Search provider endpoint is not configured.");

  const hasPlaceholder = template.includes("{query}");
  const url = new URL(
    hasPlaceholder ? template.replaceAll("{query}", encodeURIComponent(query)) : template
  );
  const params = url.searchParams;

  if (!hasPlaceholder) {
    params.set(settings.provider_query_parameter || "q", query);
  }
  params.set("format", "json");
  params.set("safesearch", safeSearchValue(settings.safe_search));
  params.set("count", String(settings.max_results));
  if (settings.freshness) {
    // Freshness values are truncated to 32 characters.
    params.set("time_range", String(settings.freshness).slice(0, 32));
  }
  return url.href;
}
|
|
|
|
/**
 * Converts a provider payload into a uniform list of result rows.
 *
 * @param {object} payload - parsed provider response.
 * @param {string} adapter - "searxng_json" reads only `payload.results`; any
 *   other value also accepts `payload.items` or `payload.web.results.value`.
 * @returns {object[]} rows with title, url, snippet, source_type, date and
 *   relevance_score; rows without a URL are dropped.
 * @throws {Error} when the payload contains no result array.
 */
function normalizeProviderResults(payload, adapter) {
  const rows = adapter === "searxng_json"
    ? payload?.results
    : payload?.results || payload?.items || payload?.web?.results?.value;
  if (!Array.isArray(rows)) throw new Error("Search provider response does not contain a supported result list.");
  return rows
    .map((row, index) => {
      // Provider data is untrusted: a null or primitive entry must not abort the
      // whole search. Mapping before filtering keeps `index` equal to the row's
      // original position, which the fallback relevance score depends on.
      if (row === null || typeof row !== "object") return null;
      return {
        title: sanitizeText(row.title || row.name || "Untitled result", 240),
        url: String(row.url || row.link || ""),
        snippet: sanitizeText(row.content || row.snippet || row.description || "", 800),
        source_type: sanitizeText(row.source_type || row.category || row.engine || "", 80) || null,
        date: normalizeDate(row.publishedDate || row.published_date || row.date),
        relevance_score: finiteScore(row.score, index)
      };
    })
    .filter((row) => row !== null && row.url);
}
|
|
|
|
/**
 * Reads a fetch-style response body into a Buffer, refusing bodies larger
 * than maxBytes.
 *
 * When the response exposes a readable stream (`response.body.getReader`), the
 * body is consumed chunk by chunk and the read is cancelled as soon as the
 * limit is exceeded, so an oversized body is never held in memory in full.
 * Responses without a stream fall back to `response.arrayBuffer()`.
 *
 * @param {object} response - object with `headers.get(name)` and either a
 *   readable `body` stream or an `arrayBuffer()` method.
 * @param {number} maxBytes - maximum accepted body size in bytes.
 * @returns {Promise<Buffer>} the complete body.
 * @throws {Error} "Provider response is too large." when the declared or actual
 *   size exceeds maxBytes.
 */
async function readBounded(response, maxBytes) {
  // Fast reject on the declared size; a missing header coerces to 0 and passes.
  const declared = Number(response.headers.get("content-length"));
  if (Number.isFinite(declared) && declared > maxBytes) throw new Error("Provider response is too large.");

  const stream = response.body;
  if (stream && typeof stream.getReader === "function") {
    const reader = stream.getReader();
    const chunks = [];
    let size = 0;
    try {
      for (;;) {
        const { done, value } = await reader.read();
        if (done) break;
        const chunk = Buffer.from(value);
        size += chunk.length;
        // Content-Length may be absent or wrong, so enforce the limit on actual bytes.
        if (size > maxBytes) throw new Error("Provider response is too large.");
        chunks.push(chunk);
      }
    } catch (error) {
      // Stop the transfer; a failure while cancelling must not mask the original error.
      await Promise.resolve(reader.cancel()).catch(() => {});
      throw error;
    }
    return Buffer.concat(chunks, size);
  }

  const buffer = Buffer.from(await response.arrayBuffer());
  if (buffer.length > maxBytes) throw new Error("Provider response is too large.");
  return buffer;
}
|
|
|
|
/**
 * Reduces an HTML document to plain text.
 *
 * Script and style elements are removed together with their contents, the
 * remaining tags are replaced by spaces, and the result is passed through
 * sanitizeText with a 12000 character cap.
 *
 * @param {*} value - page markup; coerced with String().
 * @returns {string} the extracted text.
 */
function extractPageText(value) {
  const strippers = [
    /<script\b[^>]*>[\s\S]*?<\/script>/gi,
    /<style\b[^>]*>[\s\S]*?<\/style>/gi,
    /<[^>]+>/g
  ];
  let markup = String(value);
  for (const pattern of strippers) {
    markup = markup.replace(pattern, " ");
  }
  return sanitizeText(markup, 12000);
}
|
|
|
|
/**
 * Produces a single-line, tag-free, length-limited string.
 *
 * @param {*} value - input text; falsy values become the empty string.
 * @param {number} maximum - maximum length of the returned string.
 * @returns {string} the cleaned text.
 */
function sanitizeText(value, maximum) {
  // Tags are removed before entities are decoded.
  const withoutTags = String(value || "").replace(/<[^>]+>/g, " ");
  const decoded = decodeEntities(withoutTags);
  // Control characters become spaces, then whitespace runs collapse to one space.
  const printable = decoded.replace(/[\u0000-\u001f\u007f]/g, " ");
  const collapsed = printable.replace(/\s+/g, " ").trim();
  return collapsed.slice(0, maximum);
}
|
|
|
|
/**
 * Decodes the five basic HTML character references (the named entities amp,
 * lt, gt and quot, plus the numeric reference #39 for the apostrophe).
 *
 * Decoding happens in a single pass, so text produced by one replacement is
 * never decoded a second time (an escaped ampersand followed by "lt;" stays an
 * escaped less-than sign instead of turning into "<").
 *
 * The entity names are matched through a pattern and a lookup table rather
 * than spelled out as full entity literals, so the source survives tools that
 * HTML-decode file contents.
 *
 * @param {string} value - text that may contain character references.
 * @returns {string} the text with the supported references decoded.
 */
function decodeEntities(value) {
  const replacements = {
    amp: "&",
    lt: "<",
    gt: ">",
    quot: "\"",
    "#39": "'"
  };
  return value.replace(/&(amp|lt|gt|quot|#39);/g, (_match, name) => replacements[name]);
}
|
|
|
|
/**
 * Converts a provider date value to an ISO 8601 string.
 *
 * @param {*} value - anything accepted by the Date constructor.
 * @returns {string|null} the ISO timestamp, or null when the value is falsy or
 *   does not parse to a valid date.
 */
function normalizeDate(value) {
  if (!value) {
    return null;
  }
  const parsed = new Date(value);
  if (Number.isNaN(parsed.getTime())) {
    return null;
  }
  return parsed.toISOString();
}
|
|
|
|
/**
 * Returns a numeric relevance score for a result row.
 *
 * @param {*} value - score reported by the provider, if any.
 * @param {number} index - zero-based position of the row in the result list.
 * @returns {number} the provider score when it is a finite number (or numeric
 *   string); otherwise a positional fallback that starts at 1 and drops by 0.1
 *   per position, never going below 0.
 */
function finiteScore(value, index) {
  const fallback = Math.max(0, 1 - index * 0.1);
  // Number(null) and Number("") are both 0, which would pass the finiteness
  // check and turn a missing score into "least relevant" instead of using the
  // positional fallback.
  const missing = value === null
    || value === undefined
    || (typeof value === "string" && value.trim() === "");
  if (missing) return fallback;
  const number = Number(value);
  return Number.isFinite(number) ? number : fallback;
}
|
|
|
|
/**
 * Maps a safe-search level name to the value sent in the "safesearch" parameter.
 *
 * @param {string} level - "off", "moderate", or anything else.
 * @returns {string} "0" for "off", "1" for "moderate", and "2" for every other value.
 */
function safeSearchValue(level) {
  switch (level) {
    case "off":
      return "0";
    case "moderate":
      return "1";
    default:
      return "2";
  }
}
|
|
|
|
/**
 * Creates the error thrown when a URL is rejected by the URL policy.
 *
 * @param {string} reason - policy reason identifier.
 * @returns {Error} an Error whose `code` is "URL_BLOCKED" and whose
 *   `blockedReason` carries the given reason.
 */
function blockedError(reason) {
  return Object.assign(new Error(`URL blocked by policy: ${reason}.`), {
    code: "URL_BLOCKED",
    blockedReason: reason
  });
}
|
|
|
|
module.exports = {
|
|
MAX_PAGE_BYTES,
|
|
MAX_RESPONSE_BYTES,
|
|
SearchProvider,
|
|
blockedError,
|
|
buildEndpoint,
|
|
extractPageText,
|
|
normalizeProviderResults,
|
|
readBounded,
|
|
sanitizeText,
|
|
safeHttpRequest
|
|
};
|