// 247 lines, 9.0 KiB, JavaScript
const http = require("http");
|
|
const https = require("https");
|
|
const net = require("net");
|
|
const zlib = require("zlib");
|
|
const { extractPlainText, extractReadableHtml } = require("./html_extractor");
|
|
const {
|
|
defaultResolveHost,
|
|
evaluateNetworkTarget,
|
|
evaluateUrl,
|
|
isPrivateAddress
|
|
} = require("./url_policy");
|
|
|
|
/**
 * Media types (lower-case, without parameters) that a fetched response may
 * declare. The list is frozen so importers cannot modify it.
 */
const SUPPORTED_CONTENT_TYPES = Object.freeze([
  "text/html",
  "text/plain",
  "application/xhtml+xml",
  "application/xml",
  "text/xml",
  "application/rss+xml",
  "application/atom+xml"
]);
|
|
|
|
/**
 * Fetches a public page under the URL/network policy, follows redirects
 * manually (re-checking the policy on every hop), and returns the extracted
 * content together with fetch metadata.
 */
class PageFetcher {
  /**
   * @param {object} [options]
   * @param {Function} [options.fetch] Optional fetch-compatible function. When
   *   omitted, requests go through safeHttpRequest.
   * @param {Function} [options.resolveHost] Host resolver handed to the policy
   *   checks and to safeHttpRequest. Defaults to defaultResolveHost.
   * @param {Function} [options.now] Clock returning milliseconds, used for
   *   `timing_ms`. Defaults to Date.now.
   */
  constructor(options = {}) {
    this.fetch = options.fetch || null;
    this.resolveHost = options.resolveHost || defaultResolveHost;
    this.now = options.now || Date.now;
  }

  /**
   * Fetches `value`, following at most `settings.max_redirects` redirects.
   *
   * @param {string} value URL to fetch.
   * @param {object} settings Reads `max_redirects`, `policy_mode`, `url_rules`,
   *   `max_fetch_bytes`, `max_extracted_chars` (and, via request(),
   *   `fetch_timeout_ms`).
   * @param {object} [options] `networkOnly` selects evaluateNetworkTarget
   *   instead of evaluateUrl; the object is also forwarded to request().
   * @returns {Promise<object>} Extracted page fields plus `url`, `final_url`,
   *   `content_type`, `fetched_at`, `timing_ms` and `truncated`.
   * @throws {Error} A blockedError when a policy check disallows a URL, or a
   *   fetchError with code `redirect_missing_location`, `http_error`,
   *   `unsupported_content_type` or `redirect_limit`; errors from request(),
   *   decodeBody() and the extractors propagate unchanged.
   */
  async fetchPage(value, settings, options = {}) {
    const started = this.now();

    // One policy entry point, used for every hop and for the final re-check.
    const checkPolicy = (target) => (options.networkOnly
      ? evaluateNetworkTarget(target, { resolveHost: this.resolveHost })
      : evaluateUrl(target, {
        mode: settings.policy_mode,
        rules: settings.url_rules,
        resolveHost: this.resolveHost
      }));

    // An explicit 0 means "follow no redirects". Only a missing, empty or
    // non-finite setting falls back to the default of 3.
    const configured = settings.max_redirects;
    const parsed = configured == null || configured === "" ? Number.NaN : Number(configured);
    const redirects = Number.isFinite(parsed) ? Math.max(0, parsed) : 3;

    let current = String(value || "");
    for (let count = 0; count <= redirects; count += 1) {
      const policy = await checkPolicy(current);
      if (!policy.allowed) throw blockedError(policy.reason);

      const response = await this.request(policy.url, settings, options);

      if (response.status >= 300 && response.status < 400) {
        const location = response.headers.get("location");
        if (!location) throw fetchError("redirect_missing_location", "Redirect did not include a location.");
        // Resolve relative targets against the URL that produced the redirect.
        current = new URL(location, policy.url).href;
        continue;
      }

      if (!response.ok) {
        throw fetchError("http_error", `Public page request failed (${response.status}).`);
      }

      const contentType = contentTypeBase(response.headers.get("content-type"));
      if (!SUPPORTED_CONTENT_TYPES.includes(contentType)) {
        throw fetchError("unsupported_content_type", `Unsupported content type: ${contentType || "unknown"}.`);
      }

      // NOTE(review): the decoded-size limit always comes from settings, while
      // request() lets options.maxBytes override the raw-size limit - confirm
      // that the two limits are meant to differ.
      const body = decodeBody(response.body, response.headers.get("content-encoding"), settings.max_fetch_bytes);
      const text = body.toString(detectCharset(response.headers.get("content-type")));
      const extractOptions = { maxChars: settings.max_extracted_chars };
      const extracted = contentType === "text/plain"
        ? extractPlainText(text, extractOptions)
        : extractReadableHtml(text, extractOptions);

      // Evaluate the policy once more for the URL that actually served the
      // content before anything is returned to the caller.
      const finalPolicy = await checkPolicy(policy.url);
      if (!finalPolicy.allowed) throw blockedError(finalPolicy.reason);

      return {
        url: String(value),
        final_url: finalPolicy.url,
        title: extracted.title,
        description: extracted.description,
        headings: extracted.headings,
        canonical_url: safeCanonical(extracted.canonical_url, finalPolicy.url),
        published_at: extracted.published_at,
        updated_at: extracted.updated_at,
        extracted_text: extracted.extracted_text,
        content_type: contentType,
        fetched_at: new Date().toISOString(),
        extraction_status: extracted.extraction_status,
        timing_ms: Math.max(0, this.now() - started),
        truncated: body.length >= settings.max_fetch_bytes
      };
    }
    throw fetchError("redirect_limit", "Public page request exceeded the redirect limit.");
  }

  /**
   * Performs a single GET without following redirects.
   *
   * @param {string} url Target URL.
   * @param {object} settings Supplies `fetch_timeout_ms` and `max_fetch_bytes`
   *   when the matching option is not set.
   * @param {object} [options] Optional `accept`, `headers`, `timeoutMs` and
   *   `maxBytes` overrides.
   * @returns {Promise<{ok: boolean, status: number, headers: object, body: Buffer}>}
   * @throws {Error} fetchError `response_too_large` when the body exceeds the
   *   byte limit; transport and abort errors propagate unchanged.
   */
  async request(url, settings, options = {}) {
    const headers = {
      Accept: options.accept || "text/html,text/plain,application/xhtml+xml,application/xml;q=0.8",
      "Accept-Encoding": "gzip, deflate, br",
      "User-Agent": "Lumi-Web-Search/1.1 (+https://git.rolfsvaag.no/Rolfsvaag_Datateknikk/Lumi)"
    };
    if (options.headers) Object.assign(headers, options.headers);

    const timeoutMs = options.timeoutMs || settings.fetch_timeout_ms;
    const maxBytes = options.maxBytes || settings.max_fetch_bytes;

    if (!this.fetch) {
      return safeHttpRequest(url, {
        headers,
        timeoutMs,
        maxBytes,
        resolveHost: this.resolveHost
      });
    }

    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), timeoutMs);
    // Do not let a pending timeout keep the process alive.
    timer.unref?.();
    try {
      const response = await this.fetch(url, {
        method: "GET",
        headers,
        redirect: "manual",
        signal: controller.signal
      });

      const raw = response.body;
      let body;
      if (Buffer.isBuffer(raw) || raw instanceof Uint8Array) {
        body = Buffer.from(raw);
        // Pre-buffered bodies must respect the same limit as streamed ones.
        if (body.length > maxBytes) {
          throw fetchError("response_too_large", "Public response exceeded the size limit.");
        }
      } else {
        body = await readBounded(response, maxBytes);
      }

      return {
        ok: response.ok,
        status: response.status,
        headers: response.headers,
        body
      };
    } finally {
      clearTimeout(timer);
    }
  }
}
|
|
|
|
/**
 * Performs a GET against a public host with the connection pinned to an
 * address that was checked beforehand.
 *
 * The hostname is resolved once; the request is refused when nothing resolves
 * or when any resolved address is private. The socket is then forced to the
 * first resolved address through a custom `lookup`, so the connection cannot
 * end up on a different address than the one that was checked.
 *
 * @param {string} value Absolute http(s) URL.
 * @param {object} [options]
 * @param {object} [options.headers] Request headers.
 * @param {number} [options.timeoutMs] Timeout passed to `request.setTimeout`.
 * @param {number} [options.maxBytes] Maximum number of raw body bytes.
 * @param {Function} [options.resolveHost] Resolver; defaults to defaultResolveHost.
 * @returns {Promise<{ok: boolean, status: number, headers: {get: Function}, body: Buffer}>}
 * @throws {Error} blockedError("private_network"), or a fetchError with code
 *   `response_too_large`, `timeout` (name "AbortError") or `response_aborted`;
 *   transport errors are rejected unchanged.
 */
async function safeHttpRequest(value, options = {}) {
  const url = new URL(value);
  // URL keeps the brackets around IPv6 literals; resolvers and sockets do not.
  const hostname = url.hostname.replace(/^\[|\]$/g, "");
  const addresses = await (options.resolveHost || defaultResolveHost)(hostname);
  if (!addresses.length || addresses.some(isPrivateAddress)) throw blockedError("private_network");
  const address = addresses[0];
  const transport = url.protocol === "https:" ? https : http;

  return new Promise((resolve, reject) => {
    // Settle first, then tear the socket down, so the caller always sees this
    // error regardless of the order of the socket events that follow.
    const fail = (error) => {
      reject(error);
      request.destroy(error);
    };

    const request = transport.request({
      protocol: url.protocol,
      hostname,
      port: url.port || undefined,
      method: "GET",
      path: `${url.pathname}${url.search}`,
      headers: options.headers,
      servername: url.protocol === "https:" && !net.isIP(hostname) ? hostname : undefined,
      // Always answer with the pre-checked address instead of resolving again.
      lookup: (_hostname, lookupOptions, callback) => {
        const family = net.isIP(address);
        if (lookupOptions?.all) callback(null, [{ address, family }]);
        else callback(null, address, family);
      }
    }, (response) => {
      const chunks = [];
      let size = 0;

      response.on("data", (chunk) => {
        size += chunk.length;
        if (size > options.maxBytes) {
          fail(fetchError("response_too_large", "Public response exceeded the size limit."));
          return;
        }
        chunks.push(chunk);
      });

      response.on("end", () => resolve({
        ok: response.statusCode >= 200 && response.statusCode < 300,
        status: response.statusCode,
        headers: { get: (name) => response.headers[String(name).toLowerCase()] || null },
        body: Buffer.concat(chunks)
      }));

      // A connection dropped mid-body never emits "end"; without these two
      // listeners the promise would stay pending forever.
      response.on("aborted", () => {
        reject(fetchError("response_aborted", "Public response ended before it was complete."));
      });
      response.on("error", reject);
    });

    request.setTimeout(options.timeoutMs, () => {
      const error = fetchError("timeout", "Public request timed out.");
      error.name = "AbortError";
      fail(error);
    });
    request.on("error", reject);
    request.end();
  });
}
|
|
|
|
/**
 * Reads a fetch-style response body into a Buffer, refusing bodies larger
 * than `maximum` bytes.
 *
 * A declared Content-Length above the limit is rejected before anything is
 * read. When the response exposes a reader (`response.body.getReader()`), the
 * body is consumed chunk by chunk and reading stops as soon as the limit is
 * crossed, so an oversized body is never buffered in full. Responses without
 * a reader fall back to `arrayBuffer()` followed by a size check.
 *
 * @param {object} response Object with `headers.get(name)` and either
 *   `body.getReader()` or `arrayBuffer()`.
 * @param {number} maximum Maximum accepted body size in bytes.
 * @returns {Promise<Buffer>} The complete body.
 * @throws {Error} fetchError with code `response_too_large`.
 */
async function readBounded(response, maximum) {
  const tooLarge = () => fetchError("response_too_large", "Public response exceeded the size limit.");

  const declared = Number(response.headers.get("content-length"));
  if (Number.isFinite(declared) && declared > maximum) throw tooLarge();

  const stream = response.body;
  if (!stream || typeof stream.getReader !== "function") {
    // No stream available: buffer everything, then enforce the limit.
    const buffer = Buffer.from(await response.arrayBuffer());
    if (buffer.length > maximum) throw tooLarge();
    return buffer;
  }

  const reader = stream.getReader();
  const chunks = [];
  let size = 0;
  try {
    for (;;) {
      const { done, value } = await reader.read();
      if (done) break;
      const chunk = Buffer.from(value);
      size += chunk.length;
      if (size > maximum) {
        try {
          // Tell the producer to stop; a failing cancel must not mask the
          // size error thrown below.
          await reader.cancel();
        } catch {
          // Ignored on purpose, see above.
        }
        throw tooLarge();
      }
      chunks.push(chunk);
    }
  } finally {
    reader.releaseLock?.();
  }
  return Buffer.concat(chunks, size);
}
|
|
|
|
/**
 * Undoes the HTTP Content-Encoding of a response body.
 *
 * @param {Buffer} buffer Raw body bytes.
 * @param {?string} encoding Content-Encoding header value. Case and
 *   surrounding whitespace are ignored; empty or "identity" means the body is
 *   returned as-is.
 * @param {number} maximum Upper bound for the decoded size, also passed to
 *   zlib as `maxOutputLength`.
 * @returns {Buffer} The decoded body (the same Buffer when nothing was decoded).
 * @throws {Error} fetchError with code `unsupported_encoding` (unknown
 *   encoding), `decompression_failed` (zlib rejected the data or the output
 *   limit; the zlib error is attached as `cause`) or `response_too_large`.
 */
function decodeBody(buffer, encoding, maximum) {
  const normalized = String(encoding || "").toLowerCase().trim();
  let output = buffer;

  if (normalized && normalized !== "identity") {
    let decompress;
    if (normalized === "gzip") decompress = zlib.gunzipSync;
    else if (normalized === "deflate") decompress = zlib.inflateSync;
    else if (normalized === "br") decompress = zlib.brotliDecompressSync;
    else throw fetchError("unsupported_encoding", "Unsupported response encoding.");

    try {
      output = decompress(buffer, { maxOutputLength: maximum });
    } catch (cause) {
      // zlib errors carry their own `code` (for example Z_DATA_ERROR), so they
      // must be wrapped unconditionally to surface as a fetch error.
      const error = fetchError("decompression_failed", "Response decompression failed or exceeded the size limit.");
      error.cause = cause;
      throw error;
    }
  }

  if (output.length > maximum) throw fetchError("response_too_large", "Decompressed response exceeded the size limit.");
  return output;
}
|
|
|
|
/**
 * Reduces a Content-Type header to its bare media type: everything before the
 * first ";", trimmed and lower-cased. A missing header yields "".
 *
 * @param {?string} value Raw Content-Type header value.
 * @returns {string} Media type without parameters.
 */
function contentTypeBase(value) {
  const header = String(value || "");
  const [mediaType] = header.split(";");
  return mediaType.trim().toLowerCase();
}
|
|
|
|
/**
 * Maps the `charset` parameter of a Content-Type header to a Buffer encoding
 * name.
 *
 * Besides the Node-style names (`utf8`, `ascii`, `latin1`), the labels HTTP
 * servers actually send for those encodings (`utf-8`, `us-ascii`,
 * `iso-8859-1`) are recognised. Anything else, including a missing charset,
 * is decoded as UTF-8.
 *
 * @param {?string} contentType Raw Content-Type header value.
 * @returns {"utf8"|"ascii"|"latin1"} Encoding name for `Buffer#toString`.
 */
function detectCharset(contentType) {
  const declared = String(contentType || "")
    .match(/charset\s*=\s*["']?([^;"'\s]+)/i)?.[1]
    ?.toLowerCase();

  switch (declared) {
    case "ascii":
    case "us-ascii":
      return "ascii";
    case "latin1":
    case "iso-8859-1":
    case "iso8859-1":
      return "latin1";
    default:
      // Covers "utf8", "utf-8", unknown labels and a missing charset.
      return "utf8";
  }
}
|
|
|
|
/**
 * Resolves a canonical URL against the page URL and keeps it only when the
 * result uses http or https.
 *
 * @param {?string} value Canonical URL candidate; may be relative.
 * @param {string} baseUrl URL used to resolve relative candidates.
 * @returns {?string} Absolute URL, or null when the candidate is empty,
 *   cannot be parsed, or uses another scheme.
 */
function safeCanonical(value, baseUrl) {
  if (!value) return null;

  let resolved;
  try {
    resolved = new URL(value, baseUrl);
  } catch {
    return null;
  }

  const isWebScheme = resolved.protocol === "http:" || resolved.protocol === "https:";
  return isWebScheme ? resolved.href : null;
}
|
|
|
|
/**
 * Builds the error raised when a policy check refuses a URL.
 *
 * @param {string} reason Policy reason; interpolated into the message and
 *   exposed as `blockedReason`.
 * @returns {Error} Error with `code` "URL_BLOCKED" and `blockedReason`.
 */
function blockedError(reason) {
  const base = fetchError("URL_BLOCKED", `URL blocked by policy: ${reason}.`);
  return Object.assign(base, { blockedReason: reason });
}
|
|
|
|
/**
 * Creates an Error tagged with a machine-readable `code`.
 *
 * @param {string} code Identifier stored on the error's `code` property.
 * @param {string} message Human-readable message.
 * @returns {Error} The tagged error.
 */
function fetchError(code, message) {
  const error = new Error(message);
  error.code = code;
  return error;
}
|
|
|
|
module.exports = {
|
|
PageFetcher,
|
|
SUPPORTED_CONTENT_TYPES,
|
|
blockedError,
|
|
contentTypeBase,
|
|
decodeBody,
|
|
fetchError,
|
|
readBounded,
|
|
safeHttpRequest
|
|
};
|