Lumi/plugins/lumi_ai_web_search/backend/page_fetcher.js
2026-06-14 05:01:13 +02:00

247 lines
9.0 KiB
JavaScript

const http = require("http");
const https = require("https");
const net = require("net");
const zlib = require("zlib");
const { extractPlainText, extractReadableHtml } = require("./html_extractor");
const {
defaultResolveHost,
evaluateNetworkTarget,
evaluateUrl,
isPrivateAddress
} = require("./url_policy");
/**
 * Base media types (lowercase, without parameters) that a fetched response
 * must declare before its body is decoded and handed to text extraction.
 * Frozen so the list cannot be altered at runtime.
 */
const SUPPORTED_CONTENT_TYPES = Object.freeze([
  // Documents
  "text/html",
  "text/plain",
  "application/xhtml+xml",
  // Generic XML
  "application/xml",
  "text/xml",
  // Feeds
  "application/rss+xml",
  "application/atom+xml",
]);
/**
 * Fetches a public page and returns its extracted text and metadata.
 *
 * Redirects are followed manually (never by the transport) so that every hop
 * is evaluated against the URL policy before it is requested.
 */
class PageFetcher {
  /**
   * @param {object} [options]
   * @param {Function} [options.fetch] - fetch-style function used for requests.
   *   When omitted, requests are made with safeHttpRequest.
   * @param {Function} [options.resolveHost] - Host resolver handed to the
   *   policy checks and to safeHttpRequest. Defaults to defaultResolveHost.
   * @param {Function} [options.now] - Millisecond clock used to compute
   *   timing_ms. Defaults to Date.now.
   */
  constructor(options = {}) {
    this.fetch = options.fetch || null;
    this.resolveHost = options.resolveHost || defaultResolveHost;
    this.now = options.now || Date.now;
  }

  /**
   * Fetch `value`, following at most `settings.max_redirects` redirects
   * (default 3; an explicit 0 disables redirect following).
   *
   * @param {string} value - URL to fetch.
   * @param {object} settings - Reads max_redirects, policy_mode, url_rules,
   *   max_fetch_bytes and max_extracted_chars; also forwarded to request().
   * @param {object} [options] - `networkOnly` selects evaluateNetworkTarget
   *   instead of evaluateUrl; the object is also forwarded to request().
   * @returns {Promise<object>} Page record (url, final_url, title, description,
   *   headings, canonical_url, published_at, updated_at, extracted_text,
   *   content_type, fetched_at, extraction_status, timing_ms, truncated).
   * @throws {Error} With `code` URL_BLOCKED, redirect_missing_location,
   *   http_error, unsupported_content_type or redirect_limit, plus anything
   *   thrown by request() or decodeBody().
   */
  async fetchPage(value, settings, options = {}) {
    const started = this.now();
    let current = String(value || "");

    // `Number(x) || 3` would silently turn an explicit 0 into the default, so
    // fall back to 3 only when the setting is absent or not numeric.
    const rawLimit = settings.max_redirects;
    const parsedLimit = rawLimit == null || String(rawLimit).trim() === ""
      ? Number.NaN
      : Number(rawLimit);
    const redirects = Number.isNaN(parsedLimit) ? 3 : Math.max(0, parsedLimit);

    // One policy check, used for every hop and again before returning.
    const checkPolicy = (target) => (options.networkOnly
      ? evaluateNetworkTarget(target, { resolveHost: this.resolveHost })
      : evaluateUrl(target, {
        mode: settings.policy_mode,
        rules: settings.url_rules,
        resolveHost: this.resolveHost
      }));

    for (let count = 0; count <= redirects; count += 1) {
      const policy = await checkPolicy(current);
      if (!policy.allowed) throw blockedError(policy.reason);

      const response = await this.request(policy.url, settings, options);
      if (response.status >= 300 && response.status < 400) {
        const location = response.headers.get("location");
        if (!location) throw fetchError("redirect_missing_location", "Redirect did not include a location.");
        // Resolve relative Location values against the URL that was requested.
        current = new URL(location, policy.url).href;
        continue;
      }
      if (!response.ok) {
        throw fetchError("http_error", `Public page request failed (${response.status}).`);
      }

      const contentType = contentTypeBase(response.headers.get("content-type"));
      if (!SUPPORTED_CONTENT_TYPES.includes(contentType)) {
        throw fetchError("unsupported_content_type", `Unsupported content type: ${contentType || "unknown"}.`);
      }

      const body = decodeBody(response.body, response.headers.get("content-encoding"), settings.max_fetch_bytes);
      const text = body.toString(detectCharset(response.headers.get("content-type")));
      const extracted = contentType === "text/plain"
        ? extractPlainText(text, { maxChars: settings.max_extracted_chars })
        : extractReadableHtml(text, { maxChars: settings.max_extracted_chars });

      // The final URL is evaluated once more after the body has been read.
      const finalPolicy = await checkPolicy(policy.url);
      if (!finalPolicy.allowed) throw blockedError(finalPolicy.reason);

      return {
        url: String(value),
        final_url: finalPolicy.url,
        title: extracted.title,
        description: extracted.description,
        headings: extracted.headings,
        canonical_url: safeCanonical(extracted.canonical_url, finalPolicy.url),
        published_at: extracted.published_at,
        updated_at: extracted.updated_at,
        extracted_text: extracted.extracted_text,
        content_type: contentType,
        fetched_at: new Date().toISOString(),
        extraction_status: extracted.extraction_status,
        timing_ms: Math.max(0, this.now() - started),
        // True when the decoded body length reaches max_fetch_bytes.
        truncated: body.length >= settings.max_fetch_bytes
      };
    }
    throw fetchError("redirect_limit", "Public page request exceeded the redirect limit.");
  }

  /**
   * Perform a single GET without following redirects.
   *
   * @param {string} url - URL to request.
   * @param {object} settings - Supplies fetch_timeout_ms and max_fetch_bytes
   *   when the matching option is not given.
   * @param {object} [options] - Optional accept, headers, timeoutMs, maxBytes.
   * @returns {Promise<{ok: boolean, status: number, headers: object, body: Buffer}>}
   */
  async request(url, settings, options = {}) {
    const headers = {
      Accept: options.accept || "text/html,text/plain,application/xhtml+xml,application/xml;q=0.8",
      "Accept-Encoding": "gzip, deflate, br",
      "User-Agent": "Lumi-Web-Search/1.1 (+https://git.rolfsvaag.no/Rolfsvaag_Datateknikk/Lumi)"
    };
    if (options.headers) Object.assign(headers, options.headers);
    const timeoutMs = options.timeoutMs || settings.fetch_timeout_ms;
    const maxBytes = options.maxBytes || settings.max_fetch_bytes;

    if (this.fetch) {
      // The timer covers both the request and reading the body.
      const controller = new AbortController();
      const timer = setTimeout(() => controller.abort(), timeoutMs);
      timer.unref?.();
      try {
        const response = await this.fetch(url, {
          method: "GET",
          headers,
          redirect: "manual",
          signal: controller.signal
        });
        return {
          ok: response.ok,
          status: response.status,
          headers: response.headers,
          // An in-memory body is used as-is; anything else is read with a size limit.
          body: Buffer.isBuffer(response.body) || response.body instanceof Uint8Array
            ? Buffer.from(response.body)
            : await readBounded(response, maxBytes)
        };
      } finally {
        clearTimeout(timer);
      }
    }

    return safeHttpRequest(url, {
      headers,
      timeoutMs,
      maxBytes,
      resolveHost: this.resolveHost
    });
  }
}
/**
 * Perform one GET over http/https against an address that has been resolved
 * and checked up front.
 *
 * @param {string} value - Absolute URL to request.
 * @param {object} [options]
 * @param {object} [options.headers] - Request headers.
 * @param {number} [options.timeoutMs] - Passed to request.setTimeout, so it
 *   limits socket inactivity rather than total request duration.
 * @param {number} [options.maxBytes] - Maximum number of raw body bytes accepted.
 * @param {Function} [options.resolveHost] - Resolver returning a list of
 *   addresses for a hostname. Defaults to defaultResolveHost.
 * @returns {Promise<{ok: boolean, status: number, headers: {get: Function}, body: Buffer}>}
 * @throws {Error} `URL_BLOCKED` when the host resolves to nothing or to any
 *   private address; `response_too_large`, `timeout` (name "AbortError"),
 *   `incomplete_response`, or the underlying socket error otherwise.
 */
async function safeHttpRequest(value, options = {}) {
  const url = new URL(value);
  // Strip the brackets URL keeps around IPv6 literals.
  const hostname = url.hostname.replace(/^\[|\]$/g, "");
  const addresses = await (options.resolveHost || defaultResolveHost)(hostname);
  if (!addresses.length || addresses.some(isPrivateAddress)) throw blockedError("private_network");
  const address = addresses[0];
  const transport = url.protocol === "https:" ? https : http;

  return new Promise((resolve, reject) => {
    // Settle with our own error first so it cannot be replaced by the
    // connection-reset error that tearing the socket down may produce.
    const abort = (error) => {
      reject(error);
      request.destroy(error);
    };

    const request = transport.request({
      protocol: url.protocol,
      hostname,
      port: url.port || undefined,
      method: "GET",
      path: `${url.pathname}${url.search}`,
      headers: options.headers,
      servername: url.protocol === "https:" && !net.isIP(hostname) ? hostname : undefined,
      // Always answer with the address that was checked above instead of
      // letting the socket perform its own lookup.
      lookup: (_hostname, lookupOptions, callback) => {
        const family = net.isIP(address);
        if (lookupOptions?.all) callback(null, [{ address, family }]);
        else callback(null, address, family);
      }
    }, (response) => {
      const chunks = [];
      let size = 0;
      response.on("data", (chunk) => {
        size += chunk.length;
        if (size > options.maxBytes) {
          abort(fetchError("response_too_large", "Public response exceeded the size limit."));
          return;
        }
        chunks.push(chunk);
      });
      // Node only reports a connection dropped mid-body when the response has
      // an "error" listener; without one neither "end" nor a request error
      // fires and this promise would never settle.
      response.on("error", reject);
      response.on("end", () => {
        // Guard against runtimes that emit "end" for a prematurely closed body.
        if (!response.complete) {
          reject(fetchError("incomplete_response", "Public response ended before it was complete."));
          return;
        }
        resolve({
          ok: response.statusCode >= 200 && response.statusCode < 300,
          status: response.statusCode,
          headers: { get: (name) => response.headers[String(name).toLowerCase()] || null },
          body: Buffer.concat(chunks)
        });
      });
    });

    request.setTimeout(options.timeoutMs, () => {
      const error = fetchError("timeout", "Public request timed out.");
      error.name = "AbortError";
      abort(error);
    });
    request.on("error", reject);
    request.end();
  });
}
/**
 * Read a fetch-style response body into a Buffer without accepting more than
 * `maximum` bytes.
 *
 * When the response exposes a readable stream (`body.getReader`), the body is
 * consumed chunk by chunk and reading stops as soon as the limit is crossed,
 * so a response without a Content-Length cannot force the whole payload into
 * memory. Responses without a stream fall back to `arrayBuffer()`.
 *
 * @param {object} response - Object with `headers.get(name)` and either a
 *   streamable `body` or an `arrayBuffer()` method.
 * @param {number} maximum - Largest acceptable body size in bytes.
 * @returns {Promise<Buffer>} The complete body.
 * @throws {Error} `response_too_large` when the declared or actual size
 *   exceeds `maximum`.
 */
async function readBounded(response, maximum) {
  const tooLarge = () => fetchError("response_too_large", "Public response exceeded the size limit.");

  // Reject early when the server already announces an oversized body.
  const declared = Number(response.headers.get("content-length"));
  if (Number.isFinite(declared) && declared > maximum) throw tooLarge();

  if (typeof response.body?.getReader !== "function") {
    const buffer = Buffer.from(await response.arrayBuffer());
    if (buffer.length > maximum) throw tooLarge();
    return buffer;
  }

  const reader = response.body.getReader();
  const chunks = [];
  let size = 0;
  try {
    for (;;) {
      const { done, value } = await reader.read();
      if (done) return Buffer.concat(chunks);
      size += value.byteLength;
      if (size > maximum) throw tooLarge();
      chunks.push(Buffer.from(value));
    }
  } catch (error) {
    // Best-effort: stop the download. The original error is what matters to
    // the caller, so a failing cancel is deliberately ignored.
    try {
      await reader.cancel();
    } catch {
      // ignored
    }
    throw error;
  }
}
/**
 * Undo the HTTP Content-Encoding of a response body.
 *
 * @param {Buffer} buffer - Raw response body.
 * @param {?string} encoding - Content-Encoding header value. Empty or
 *   "identity" returns the input unchanged; gzip, deflate and br are decoded.
 * @param {number} maximum - Upper bound in bytes for the decoded body; also
 *   passed to zlib as maxOutputLength.
 * @returns {Buffer} The decoded body.
 * @throws {Error} `unsupported_encoding` for any other encoding,
 *   `decompression_failed` when zlib rejects the data or the output limit
 *   (the zlib error is attached as `cause`), `response_too_large` when the
 *   result is longer than `maximum`.
 */
function decodeBody(buffer, encoding, maximum) {
  const normalized = String(encoding || "").toLowerCase().trim();

  let decompress = null;
  switch (normalized) {
    case "":
    case "identity":
      break;
    case "gzip":
      decompress = zlib.gunzipSync;
      break;
    case "deflate":
      decompress = zlib.inflateSync;
      break;
    case "br":
      decompress = zlib.brotliDecompressSync;
      break;
    default:
      throw fetchError("unsupported_encoding", "Unsupported response encoding.");
  }

  let output = buffer;
  if (decompress) {
    try {
      output = decompress(buffer, { maxOutputLength: maximum });
    } catch (error) {
      // zlib errors carry their own `code` (e.g. Z_DATA_ERROR,
      // ERR_BUFFER_TOO_LARGE), so they must be wrapped unconditionally for
      // callers to see one stable code.
      const failure = fetchError("decompression_failed", "Response decompression failed or exceeded the size limit.");
      failure.cause = error;
      throw failure;
    }
  }

  if (output.length > maximum) throw fetchError("response_too_large", "Decompressed response exceeded the size limit.");
  return output;
}
/**
 * Reduce a Content-Type header to its bare media type: everything before the
 * first ";", trimmed and lowercased. Falsy input yields "".
 *
 * @param {?string} value - Raw Content-Type header value.
 * @returns {string} Media type without parameters.
 */
function contentTypeBase(value) {
  const header = String(value || "");
  const [mediaType] = header.split(";");
  return mediaType.trim().toLowerCase();
}
/**
 * Pick the Buffer encoding name for a response from the `charset` parameter
 * of its Content-Type header.
 *
 * Recognised labels are mapped to the names Buffer#toString accepts, including
 * the common aliases "iso-8859-1" (latin1) and "us-ascii" (ascii). Anything
 * else, or a missing charset, falls back to "utf8".
 *
 * @param {?string} contentType - Raw Content-Type header value.
 * @returns {"utf8"|"ascii"|"latin1"} Encoding name for Buffer#toString.
 */
function detectCharset(contentType) {
  const label = String(contentType || "")
    .match(/charset\s*=\s*["']?([^;"'\s]+)/i)?.[1]
    ?.toLowerCase();

  switch (label) {
    case "ascii":
    case "us-ascii":
      return "ascii";
    case "latin1":
    case "iso-8859-1":
    case "iso8859-1":
    case "iso_8859-1":
      return "latin1";
    case "utf8":
    case "utf-8":
    default:
      return "utf8";
  }
}
/**
 * Resolve a page's canonical link against `baseUrl` and accept it only when
 * the result is an http(s) URL.
 *
 * @param {?string} value - Canonical URL as found in the page; may be relative.
 * @param {string} baseUrl - URL used to resolve relative values.
 * @returns {?string} Absolute href, or null when the value is empty, cannot
 *   be parsed, or uses another scheme.
 */
function safeCanonical(value, baseUrl) {
  if (!value) return null;

  let resolved;
  try {
    resolved = new URL(value, baseUrl);
  } catch {
    return null;
  }

  const isWeb = resolved.protocol === "http:" || resolved.protocol === "https:";
  return isWeb ? resolved.href : null;
}
/**
 * Build the error thrown when a URL is rejected by policy.
 *
 * @param {string} reason - Policy reason; included in the message and exposed
 *   as `blockedReason`.
 * @returns {Error} Error with `code` "URL_BLOCKED" and `blockedReason`.
 */
function blockedError(reason) {
  return Object.assign(
    fetchError("URL_BLOCKED", `URL blocked by policy: ${reason}.`),
    { blockedReason: reason }
  );
}
/**
 * Create an Error tagged with a machine-readable `code`.
 *
 * @param {string} code - Identifier stored on the error's `code` property.
 * @param {string} message - Human-readable error message.
 * @returns {Error} The tagged error.
 */
function fetchError(code, message) {
  const error = new Error(message);
  error.code = code;
  return error;
}
// Public surface of this module: the fetcher class, the supported content
// type list, and the error/decoding helpers. detectCharset and safeCanonical
// are not exported.
module.exports = {
PageFetcher,
SUPPORTED_CONTENT_TYPES,
blockedError,
contentTypeBase,
decodeBody,
fetchError,
readBounded,
safeHttpRequest
};