const http = require("http"); const https = require("https"); const net = require("net"); const zlib = require("zlib"); const { extractPlainText, extractReadableHtml } = require("./html_extractor"); const { defaultResolveHost, evaluateNetworkTarget, evaluateUrl, isPrivateAddress } = require("./url_policy"); const SUPPORTED_CONTENT_TYPES = Object.freeze([ "text/html", "text/plain", "application/xhtml+xml", "application/xml", "text/xml", "application/rss+xml", "application/atom+xml" ]); class PageFetcher { constructor(options = {}) { this.fetch = options.fetch || null; this.resolveHost = options.resolveHost || defaultResolveHost; this.now = options.now || Date.now; } async fetchPage(value, settings, options = {}) { const started = this.now(); let current = String(value || ""); const redirects = Math.max(0, Number(settings.max_redirects) || 3); for (let count = 0; count <= redirects; count += 1) { const policy = options.networkOnly ? await evaluateNetworkTarget(current, { resolveHost: this.resolveHost }) : await evaluateUrl(current, { mode: settings.policy_mode, rules: settings.url_rules, resolveHost: this.resolveHost }); if (!policy.allowed) throw blockedError(policy.reason); const response = await this.request(policy.url, settings, options); if (response.status >= 300 && response.status < 400) { const location = response.headers.get("location"); if (!location) throw fetchError("redirect_missing_location", "Redirect did not include a location."); current = new URL(location, policy.url).href; continue; } if (!response.ok) { throw fetchError("http_error", `Public page request failed (${response.status}).`); } const contentType = contentTypeBase(response.headers.get("content-type")); if (!SUPPORTED_CONTENT_TYPES.includes(contentType)) { throw fetchError("unsupported_content_type", `Unsupported content type: ${contentType || "unknown"}.`); } const body = decodeBody(response.body, response.headers.get("content-encoding"), settings.max_fetch_bytes); const text = body.toString(detectCharset(response.headers.get("content-type"))); const extracted = contentType === "text/plain" ? extractPlainText(text, { maxChars: settings.max_extracted_chars }) : extractReadableHtml(text, { maxChars: settings.max_extracted_chars }); const finalPolicy = options.networkOnly ? await evaluateNetworkTarget(policy.url, { resolveHost: this.resolveHost }) : await evaluateUrl(policy.url, { mode: settings.policy_mode, rules: settings.url_rules, resolveHost: this.resolveHost }); if (!finalPolicy.allowed) throw blockedError(finalPolicy.reason); return { url: String(value), final_url: finalPolicy.url, title: extracted.title, description: extracted.description, headings: extracted.headings, canonical_url: safeCanonical(extracted.canonical_url, finalPolicy.url), published_at: extracted.published_at, updated_at: extracted.updated_at, extracted_text: extracted.extracted_text, content_type: contentType, fetched_at: new Date().toISOString(), extraction_status: extracted.extraction_status, timing_ms: Math.max(0, this.now() - started), truncated: body.length >= settings.max_fetch_bytes }; } throw fetchError("redirect_limit", "Public page request exceeded the redirect limit."); } async request(url, settings, options = {}) { const headers = { Accept: options.accept || "text/html,text/plain,application/xhtml+xml,application/xml;q=0.8", "Accept-Encoding": "gzip, deflate, br", "User-Agent": "Lumi-Web-Search/1.1 (+https://git.rolfsvaag.no/Rolfsvaag_Datateknikk/Lumi)" }; if (options.headers) Object.assign(headers, options.headers); const timeoutMs = options.timeoutMs || settings.fetch_timeout_ms; const maxBytes = options.maxBytes || settings.max_fetch_bytes; if (this.fetch) { const controller = new AbortController(); const timer = setTimeout(() => controller.abort(), timeoutMs); timer.unref?.(); try { const response = await this.fetch(url, { method: "GET", headers, redirect: "manual", signal: controller.signal }); return { ok: response.ok, status: response.status, headers: response.headers, body: Buffer.isBuffer(response.body) || response.body instanceof Uint8Array ? Buffer.from(response.body) : await readBounded(response, maxBytes) }; } finally { clearTimeout(timer); } } return safeHttpRequest(url, { headers, timeoutMs, maxBytes, resolveHost: this.resolveHost }); } } async function safeHttpRequest(value, options = {}) { const url = new URL(value); const hostname = url.hostname.replace(/^\[|\]$/g, ""); const addresses = await (options.resolveHost || defaultResolveHost)(hostname); if (!addresses.length || addresses.some(isPrivateAddress)) throw blockedError("private_network"); const address = addresses[0]; const transport = url.protocol === "https:" ? https : http; return new Promise((resolve, reject) => { const request = transport.request({ protocol: url.protocol, hostname, port: url.port || undefined, method: "GET", path: `${url.pathname}${url.search}`, headers: options.headers, servername: url.protocol === "https:" && !net.isIP(hostname) ? hostname : undefined, lookup: (_hostname, lookupOptions, callback) => { const family = net.isIP(address); if (lookupOptions?.all) callback(null, [{ address, family }]); else callback(null, address, family); } }, (response) => { const chunks = []; let size = 0; response.on("data", (chunk) => { size += chunk.length; if (size > options.maxBytes) { request.destroy(fetchError("response_too_large", "Public response exceeded the size limit.")); return; } chunks.push(chunk); }); response.on("end", () => resolve({ ok: response.statusCode >= 200 && response.statusCode < 300, status: response.statusCode, headers: { get: (name) => response.headers[String(name).toLowerCase()] || null }, body: Buffer.concat(chunks) })); }); request.setTimeout(options.timeoutMs, () => { const error = fetchError("timeout", "Public request timed out."); error.name = "AbortError"; request.destroy(error); }); request.on("error", reject); request.end(); }); } async function readBounded(response, maximum) { const declared = Number(response.headers.get("content-length")); if (Number.isFinite(declared) && declared > maximum) { throw fetchError("response_too_large", "Public response exceeded the size limit."); } const buffer = Buffer.from(await response.arrayBuffer()); if (buffer.length > maximum) throw fetchError("response_too_large", "Public response exceeded the size limit."); return buffer; } function decodeBody(buffer, encoding, maximum) { const normalized = String(encoding || "").toLowerCase().trim(); let output = buffer; try { if (normalized === "gzip") output = zlib.gunzipSync(buffer, { maxOutputLength: maximum }); else if (normalized === "deflate") output = zlib.inflateSync(buffer, { maxOutputLength: maximum }); else if (normalized === "br") output = zlib.brotliDecompressSync(buffer, { maxOutputLength: maximum }); else if (normalized && normalized !== "identity") { throw fetchError("unsupported_encoding", "Unsupported response encoding."); } } catch (error) { if (error.code) throw error; throw fetchError("decompression_failed", "Response decompression failed or exceeded the size limit."); } if (output.length > maximum) throw fetchError("response_too_large", "Decompressed response exceeded the size limit."); return output; } function contentTypeBase(value) { return String(value || "").split(";")[0].trim().toLowerCase(); } function detectCharset(contentType) { const charset = String(contentType || "").match(/charset\s*=\s*["']?([^;"'\s]+)/i)?.[1]?.toLowerCase(); return ["utf8", "utf-8", "ascii", "latin1"].includes(charset) ? charset.replace("-", "") : "utf8"; } function safeCanonical(value, baseUrl) { if (!value) return null; try { const resolved = new URL(value, baseUrl); return ["http:", "https:"].includes(resolved.protocol) ? resolved.href : null; } catch { return null; } } function blockedError(reason) { const error = fetchError("URL_BLOCKED", `URL blocked by policy: ${reason}.`); error.blockedReason = reason; return error; } function fetchError(code, message) { return Object.assign(new Error(message), { code }); } module.exports = { PageFetcher, SUPPORTED_CONTENT_TYPES, blockedError, contentTypeBase, decodeBody, fetchError, readBounded, safeHttpRequest };