Lumi/plugins/okf/backend/file_knowledge.js
2026-06-25 14:10:04 +02:00

675 lines
23 KiB
JavaScript

const fs = require("fs");
const path = require("path");
// Scope directory names that live directly under the knowledge root.
const KNOWLEDGE_SCOPES = Object.freeze(["corrections", "community", "plugins", "core"]);
// Base ranking weight per scope; added to an entry's own priority when search results are scored.
const SCOPE_PRIORITY = Object.freeze({
corrections: 400,
community: 300,
plugins: 200,
core: 100
});
// Statuses that non-admin users are allowed to see (see canSeeKnowledgeEntry).
const ACTIVE_STATUSES = new Set(["active", "published", "approved"]);
// Statuses that parseKnowledgeFile drops unless the caller passes includeHidden.
const HIDDEN_STATUSES = new Set(["archived", "deleted", "disabled", "draft"]);
// Accepted visibility levels; any other value falls back to "user".
const VISIBILITY_VALUES = new Set(["user", "mod", "admin"]);
// Frontmatter keys that are skipped when placeholders are listed or registered.
const PLACEHOLDER_SUGGEST_RESERVED_KEYS = new Set([
"id",
"title",
"scope",
"status",
"priority",
"visibility",
"category",
"tags",
"generated",
"editable",
"created_at",
"updated_at"
]);
// Words removed from token sets before queries are matched against chunks.
const SEARCH_STOPWORDS = new Set([
"a", "an", "and", "are", "about", "describe", "for", "identify", "is", "me",
"of", "please", "tell", "the", "to", "what", "who", "was", "were"
]);
// Parsed-entry cache keyed by "<resolved knowledge root>:<all|active>"; each value
// holds a per-file Map of { mtimeMs, size, entry } plus the flat entries array.
const knowledgeIndexCache = new Map();
/**
 * Resolve the directory that holds every knowledge scope.
 *
 * @param {string} [rootDir=process.cwd()] Project root directory.
 * @returns {string} Path of the "knowledge" directory inside rootDir.
 */
function knowledgeRoot(rootDir = process.cwd()) {
  const segments = [rootDir, "knowledge"];
  return path.join(...segments);
}
/**
 * Create the knowledge root and one sub-directory per scope if they are missing.
 *
 * @param {string} [rootDir=process.cwd()] Project root directory.
 * @returns {string} Path of the knowledge root.
 */
function ensureKnowledgeDirs(rootDir = process.cwd()) {
  const base = knowledgeRoot(rootDir);
  KNOWLEDGE_SCOPES.forEach((scopeName) => {
    // recursive: true also creates the knowledge root itself and tolerates existing directories.
    fs.mkdirSync(path.join(base, scopeName), { recursive: true });
  });
  return base;
}
/**
 * Rewrite legacy single-brace placeholders (`{scope.key}`) in every knowledge
 * file to the double-brace form (`{{scope.key}}`).
 *
 * Only dotted keys are touched, and placeholders that already carry a second
 * brace on either side are left alone. The match uses a lookbehind instead of
 * consuming the preceding character, so directly adjacent placeholders such as
 * `{a.b}{c.d}` are both migrated in a single pass.
 *
 * @param {string} [rootDir=process.cwd()] Project root directory.
 * @returns {{root: string, changed: number}} Knowledge root and the number of files rewritten.
 */
function migrateSingleBracePlaceholders(rootDir = process.cwd()) {
  const root = ensureKnowledgeDirs(rootDir);
  const singleBrace = /(?<!\{)\{([A-Za-z0-9_.-]+\.[A-Za-z0-9_.-]+)\}(?!\})/g;
  let changed = 0;
  for (const filePath of listKnowledgeFiles(rootDir)) {
    const content = fs.readFileSync(filePath, "utf8");
    const next = content.replace(singleBrace, (match, key) => `{{${key}}}`);
    if (next === content) continue;
    fs.writeFileSync(filePath, next);
    changed += 1;
  }
  // Cached entries were parsed from the old file contents.
  if (changed) knowledgeIndexCache.clear();
  return { root, changed };
}
/**
 * Collect every Markdown file found under the existing scope directories.
 *
 * @param {string} [rootDir=process.cwd()] Project root directory.
 * @returns {string[]} Sorted list of file paths.
 */
function listKnowledgeFiles(rootDir = process.cwd()) {
  const base = knowledgeRoot(rootDir);
  const found = [];
  for (const scopeName of KNOWLEDGE_SCOPES) {
    const scopeDir = path.join(base, scopeName);
    // Scopes that have not been created yet are simply skipped.
    if (fs.existsSync(scopeDir)) {
      scanMarkdownFiles(scopeDir, found);
    }
  }
  found.sort();
  return found;
}
/**
 * Load all knowledge entries, re-parsing only files whose mtime or size changed
 * since the previous call for the same root and mode.
 *
 * @param {string} [rootDir=process.cwd()] Project root directory.
 * @param {{includeHidden?: boolean}} [options] When includeHidden is truthy,
 *   entries with a hidden status are returned as well (cached separately).
 * @returns {object[]} Copies of the parsed entries (see cloneKnowledgeEntry).
 */
function loadKnowledgeEntries(rootDir = process.cwd(), options = {}) {
  const root = knowledgeRoot(rootDir);
  const files = listKnowledgeFiles(rootDir);
  const mode = options.includeHidden ? "all" : "active";
  const cacheKey = `${path.resolve(root)}:${mode}`;
  const cached = knowledgeIndexCache.get(cacheKey) || { files: new Map(), entries: [] };
  const nextFiles = new Map();
  const entries = [];
  files.forEach((filePath) => {
    const { mtimeMs, size } = fs.statSync(filePath);
    const known = cached.files.get(filePath);
    const unchanged = Boolean(known) && known.mtimeMs === mtimeMs && known.size === size;
    // A cached null (hidden entry in "active" mode) is reused just like a parsed entry.
    const entry = unchanged ? known.entry : parseKnowledgeFile(filePath, root, options);
    nextFiles.set(filePath, { mtimeMs, size, entry });
    if (entry) entries.push(entry);
  });
  knowledgeIndexCache.set(cacheKey, { files: nextFiles, entries });
  return entries.map((entry) => cloneKnowledgeEntry(entry));
}
/**
 * Search the file-based knowledge for chunks matching a query.
 *
 * Placeholders are resolved using only the entries the user may see. Results
 * are ordered by score, then priority, then source path.
 *
 * @param {object} [params]
 * @param {string} [params.query=""] Free-text query; an empty token set matches every chunk.
 * @param {object} [params.user] Requesting user; a falsy user yields no results.
 * @param {number} [params.limit=5] Maximum results, clamped to 1..25.
 * @param {string} [params.rootDir=process.cwd()] Project root directory.
 * @returns {object[]} Result records with id, slug, title, category, visibility,
 *   summary, facts, source and source_metadata.
 */
function searchFileKnowledge({ query = "", user, limit = 5, rootDir = process.cwd() } = {}) {
  const access = accessForUser(user);
  if (!access.authenticated) return [];
  const tokens = tokenSet(query);
  const visible = loadKnowledgeEntries(rootDir).filter((entry) => canSeeKnowledgeEntry(entry, access));
  const resolved = resolveVisibleKnowledgePlaceholders(visible);
  const maxResults = Math.max(1, Math.min(Number(limit) || 5, 25));

  const candidates = [];
  for (const entry of resolved) {
    for (const chunk of entry.chunks) {
      const scored = scoreKnowledgeChunk(entry, chunk, tokens);
      if (scored.matched || !tokens.size) candidates.push(scored);
    }
  }

  const byRelevance = (a, b) => (
    b.score - a.score
    || b.priority - a.priority
    || a.source_metadata.path.localeCompare(b.source_metadata.path)
  );
  candidates.sort(byRelevance);

  return candidates.slice(0, maxResults).map((result) => {
    const { id, slug, title, category, visibility, summary, facts, source, source_metadata } = result;
    return { id, slug, title, category, visibility, summary, facts, source, source_metadata };
  });
}
/**
 * List the placeholder tokens that the given user can reference.
 *
 * Every non-reserved frontmatter key with a non-empty value yields two tokens:
 * `{{<id>.<key>}}` and `{{<scope>.<slug>.<key>}}`.
 *
 * @param {object} [params]
 * @param {object} [params.user] Requesting user; a falsy user yields an empty list.
 * @param {string} [params.rootDir=process.cwd()] Project root directory.
 * @param {boolean} [params.includeHidden=false] Passed through to loadKnowledgeEntries.
 * @returns {string[]} Sorted, de-duplicated placeholder tokens.
 */
function listKnowledgePlaceholders({ user, rootDir = process.cwd(), includeHidden = false } = {}) {
  const access = accessForUser(user);
  if (!access.authenticated) return [];
  const visible = loadKnowledgeEntries(rootDir, { includeHidden })
    .filter((entry) => canSeeKnowledgeEntry(entry, access));
  const keys = new Set();
  visible.forEach((entry) => {
    const frontmatter = entry.frontmatter || {};
    Object.keys(frontmatter).forEach((key) => {
      if (PLACEHOLDER_SUGGEST_RESERVED_KEYS.has(key)) return;
      if (!placeholderValue(frontmatter[key])) return;
      keys.add(`${entry.id}.${key}`);
      keys.add(`${entry.scope}.${entry.slug}.${key}`);
    });
  });
  const sorted = Array.from(keys).sort((a, b) => a.localeCompare(b));
  return sorted.map((key) => `{{${key}}}`);
}
/**
 * Register one placeholder definition per non-reserved frontmatter value with
 * the given placeholder registry.
 *
 * The "okf.file" namespace is unregistered first when the registry supports
 * it, so repeated calls replace earlier definitions.
 *
 * @param {object} placeholders Registry exposing registerPlaceholders(definitions)
 *   and optionally unregisterNamespace(name).
 * @param {{rootDir?: string}} [options] Project root directory (defaults to process.cwd()).
 * @returns {*} Whatever registerPlaceholders returns, or [] when no usable registry is given.
 */
function registerKnowledgePlaceholderDefinitions(placeholders, { rootDir = process.cwd() } = {}) {
  if (!placeholders?.registerPlaceholders) return [];
  placeholders.unregisterNamespace?.("okf.file");
  const eligible = loadKnowledgeEntries(rootDir)
    .filter((entry) => !HIDDEN_STATUSES.has(entry.status));
  const definitions = eligible.flatMap((entry) => {
    const frontmatter = entry.frontmatter || {};
    return Object.keys(frontmatter).flatMap((key) => {
      if (PLACEHOLDER_SUGGEST_RESERVED_KEYS.has(key)) return [];
      const value = placeholderValue(frontmatter[key]);
      if (!value) return [];
      return [{
        id: `okf.file.${entry.scope}.${entry.slug}.${key}`,
        namespace: `okf.file.${entry.scope}`,
        aliases: [
          `${entry.id}.${key}`,
          `${entry.scope}.${entry.slug}.${key}`
        ],
        label: `${entry.title}: ${key.replace(/[_-]+/g, " ")}`,
        description: `Frontmatter value from ${entry.title}.`,
        value_type: "string",
        sensitivity: visibilitySensitivity(entry.visibility),
        min_editor_role: "user",
        min_viewer_role: visibilityRole(entry.visibility),
        allowed_field_types: ["okf_markdown"],
        group: `OKF ${entry.scope}`,
        // The value is captured at registration time.
        resolver: () => value
      }];
    });
  });
  return placeholders.registerPlaceholders(definitions);
}
/**
 * Read one knowledge Markdown file and turn it into an entry record.
 *
 * @param {string} filePath Path of the Markdown file.
 * @param {string} root Knowledge root; the first path segment below it is the scope.
 * @param {{includeHidden?: boolean}} [options] Return hidden-status entries too when truthy.
 * @returns {object|null} Normalized metadata plus path, file_slug, body and chunks,
 *   or null when the status is hidden and includeHidden is not set.
 */
function parseKnowledgeFile(filePath, root, options = {}) {
  const raw = fs.readFileSync(filePath, "utf8");
  const relativePath = normalizePath(path.relative(root, filePath));
  const [firstSegment] = relativePath.split("/");
  const scope = firstSegment || "core";
  const { frontmatter, body: rawBody } = splitFrontmatter(raw);
  const metadata = normalizeMetadata(frontmatter, filePath, relativePath, scope);
  const hidden = HIDDEN_STATUSES.has(metadata.status);
  if (hidden && !options.includeHidden) return null;

  const body = cleanText(rawBody, 180000);
  const stem = path.basename(relativePath, path.extname(relativePath));
  const chunks = chunkMarkdown(body, metadata).map((chunk) => ({ ...chunk, path: relativePath }));
  return {
    ...metadata,
    path: relativePath,
    file_slug: slugify(stem),
    body,
    chunks
  };
}
/**
 * Summarize every community-scope knowledge file, hidden ones included.
 *
 * @param {string} [rootDir=process.cwd()] Project root directory.
 * @returns {object[]} Summaries sorted by title; `slug` is the file slug and
 *   `entry_slug` the slug derived from the entry id.
 */
function listCommunityKnowledgeFiles(rootDir = process.cwd()) {
  const summaries = [];
  for (const entry of loadKnowledgeEntries(rootDir, { includeHidden: true })) {
    if (entry.scope !== "community") continue;
    summaries.push({
      id: entry.id,
      slug: entry.file_slug,
      entry_slug: entry.slug,
      title: entry.title,
      status: entry.status,
      visibility: entry.visibility,
      priority: entry.priority,
      tags: entry.tags,
      editable: entry.editable,
      generated: entry.generated,
      path: `knowledge/${entry.path}`,
      updated_at: entry.updated_at
    });
  }
  return summaries.sort((left, right) => left.title.localeCompare(right.title));
}
/**
 * Find a community-scope entry by file slug, entry slug or id.
 *
 * @param {string} [rootDir=process.cwd()] Project root directory.
 * @param {string} slug Value compared against file_slug, slug and id.
 * @returns {object|null} The entry with `slug` set to the file slug, `entry_slug`
 *   set to the id-derived slug and `path` prefixed with "knowledge/"; null when not found.
 */
function getCommunityKnowledgeFile(rootDir = process.cwd(), slug) {
  const matchesSlug = (item) => [item.file_slug, item.slug, item.id].some((candidate) => candidate === slug);
  const found = loadKnowledgeEntries(rootDir, { includeHidden: true })
    .find((item) => item.scope === "community" && matchesSlug(item));
  if (!found) return null;
  return {
    ...found,
    slug: found.file_slug,
    entry_slug: found.slug,
    path: `knowledge/${found.path}`
  };
}
/**
 * Create or update a community knowledge file.
 *
 * The metadata is validated and the file content is serialized before anything
 * on disk is touched, so a rejected save (for example a missing title) can no
 * longer delete the file that is being renamed.
 *
 * @param {string} [rootDir=process.cwd()] Project root directory.
 * @param {object} [values] Submitted values. `slug`, `id` or `title` determine the
 *   file name; `existing_slug` identifies the file being edited or renamed;
 *   `body` is the Markdown content.
 * @returns {object|null} The saved entry as returned by getCommunityKnowledgeFile.
 * @throws {Error} When the existing file is generated or not editable, or when
 *   normalizeCommunityFileValues rejects the values.
 */
function saveCommunityKnowledgeFile(rootDir = process.cwd(), values = {}) {
  const root = ensureKnowledgeDirs(rootDir);
  const slug = slugify(values.slug || values.id || values.title);
  const existing = values.existing_slug ? getCommunityKnowledgeFile(rootDir, values.existing_slug) : null;
  if (existing && (existing.generated || !existing.editable)) {
    throw new Error("This community OKF file is not editable.");
  }

  // Validate and serialize first: both can throw, and neither must leave the disk half-updated.
  const metadata = normalizeCommunityFileValues(values, slug);
  const markdown = serializeKnowledgeFile(metadata, values.body || "");

  const filePath = path.join(root, "community", `${slug}.md`);
  if (existing) {
    // existing.path already carries the "knowledge/" prefix, so it resolves against rootDir.
    const existingPath = path.resolve(rootDir, existing.path);
    if (existingPath !== path.resolve(filePath)) {
      fs.rmSync(existingPath, { force: true });
    }
  }
  fs.writeFileSync(filePath, markdown);
  knowledgeIndexCache.clear();
  return getCommunityKnowledgeFile(rootDir, slug);
}
/**
 * Write a correction knowledge file and return the freshly parsed entry.
 *
 * @param {string} [rootDir=process.cwd()] Project root directory.
 * @param {object} [values] Submitted values. `slug`, `id` or `title` determine the
 *   file name; `body` is the Markdown content.
 * @returns {object|null} Parsed entry with `slug` set to the file slug, `entry_slug`
 *   set to the id-derived slug and `path` prefixed with "knowledge/"; null when
 *   parseKnowledgeFile returns nothing.
 * @throws {Error} When normalizeCorrectionFileValues rejects the values.
 */
function saveCorrectionKnowledgeFile(rootDir = process.cwd(), values = {}) {
  const root = ensureKnowledgeDirs(rootDir);
  const slug = slugify(values.slug || values.id || values.title);
  const target = path.join(root, "corrections", `${slug}.md`);
  const metadata = normalizeCorrectionFileValues(values, slug);
  const markdown = serializeKnowledgeFile(metadata, values.body || "");
  fs.writeFileSync(target, markdown);
  knowledgeIndexCache.clear();

  const saved = parseKnowledgeFile(target, root, { includeHidden: true });
  if (!saved) return null;
  return {
    ...saved,
    slug: saved.file_slug,
    entry_slug: saved.slug,
    path: `knowledge/${saved.path}`
  };
}
/**
 * Copy an entry so callers can modify it without touching the cached original.
 * The entry, its tags array, its frontmatter object and each chunk are copied
 * one level deep.
 *
 * @param {object} entry Entry with `tags` (iterable), `frontmatter` and `chunks` (array).
 * @returns {object} The copy.
 */
function cloneKnowledgeEntry(entry) {
  const tags = Array.from(entry.tags);
  const frontmatter = { ...entry.frontmatter };
  const chunks = entry.chunks.map((chunk) => {
    return { ...chunk };
  });
  return { ...entry, tags, frontmatter, chunks };
}
/**
 * Resolve `{{...}}` placeholders in every entry body and re-chunk the result.
 *
 * The placeholder map is built from the entries passed in only, so values of
 * entries that were filtered out beforehand cannot be resolved.
 *
 * @param {object[]} entries Entries that the caller has already filtered.
 * @returns {object[]} New entry objects with resolved `body` and rebuilt `chunks`.
 */
function resolveVisibleKnowledgePlaceholders(entries) {
  const lookup = buildPlaceholderMap(entries);
  return entries.map((entry) => {
    const body = resolvePlaceholders(entry.body, lookup);
    const chunks = chunkMarkdown(body, entry).map((chunk) => Object.assign({}, chunk, { path: entry.path }));
    return { ...entry, body, chunks };
  });
}
/**
 * Build the lookup used to resolve placeholders.
 *
 * Each non-empty value is stored under `<id>.<key>` and `<scope>.<slug>.<key>`.
 * Normalized entry fields override frontmatter values of the same name.
 *
 * @param {object[]} entries Entries contributing values.
 * @returns {Map<string, string>} Placeholder key to string value.
 */
function buildPlaceholderMap(entries) {
  const lookup = new Map();
  entries.forEach((entry) => {
    const values = {
      ...entry.frontmatter,
      id: entry.id,
      title: entry.title,
      scope: entry.scope,
      status: entry.status,
      priority: entry.priority,
      visibility: entry.visibility,
      category: entry.category,
      tags: entry.tags,
      generated: entry.generated,
      editable: entry.editable,
      created_at: entry.created_at,
      updated_at: entry.updated_at
    };
    Object.keys(values).forEach((key) => {
      const text = placeholderValue(values[key]);
      if (!text) return;
      lookup.set(`${entry.id}.${key}`, text);
      lookup.set(`${entry.scope}.${entry.slug}.${key}`, text);
    });
  });
  return lookup;
}
/**
 * Replace `{{key}}` tokens with values from the placeholder map.
 *
 * Up to three passes are made so that values which themselves contain
 * placeholders get resolved; unknown keys become "[missing OKF reference]".
 *
 * @param {*} value Text to process; falsy values are treated as "".
 * @param {Map<string, string>} placeholders Lookup of key to replacement.
 * @returns {string} The text with placeholders substituted.
 */
function resolvePlaceholders(value, placeholders) {
  const substitute = (match, key) => {
    const replacement = placeholders.get(key);
    if (replacement === undefined) return "[missing OKF reference]";
    return replacement;
  };
  let output = String(value || "");
  let passesLeft = 3;
  while (passesLeft > 0) {
    const next = output.replace(/\{\{\s*([A-Za-z0-9_.-]+)\s*\}\}/g, substitute);
    if (next === output) break;
    output = next;
    passesLeft -= 1;
  }
  return output;
}
/**
 * Split a Markdown document into its frontmatter and body.
 *
 * The document must start with a `---` line. The frontmatter ends at the next
 * line consisting only of `---` (trailing spaces or tabs allowed). The search
 * starts at the newline that ends the opening fence, so an empty block
 * (`---\n---\n`) is recognised, and lines that merely start with three dashes
 * (for example `----`) no longer end the block early.
 *
 * @param {*} content Raw file content; falsy values are treated as "".
 * @returns {{frontmatter: object, body: string}} Parsed frontmatter (empty when
 *   no valid block exists) and the body with line endings normalized to "\n".
 */
function splitFrontmatter(content) {
  const normalized = String(content || "").replace(/\r\n?/g, "\n");
  if (!normalized.startsWith("---\n")) {
    return { frontmatter: {}, body: normalized };
  }
  const closingFence = /\n---[ \t]*(?:\n|$)/g;
  // Index 3 is the newline that terminates the opening fence.
  closingFence.lastIndex = 3;
  const fence = closingFence.exec(normalized);
  if (!fence) {
    return { frontmatter: {}, body: normalized };
  }
  const rawFrontmatter = normalized.slice(4, fence.index);
  const body = normalized.slice(fence.index + fence[0].length);
  return {
    frontmatter: parseFrontmatter(rawFrontmatter),
    body
  };
}
/**
 * Parse the flat `key: value` lines of a frontmatter block.
 *
 * Blank lines, lines starting with "#" and lines that are not a `key: value`
 * pair are ignored. A `__proto__` key is skipped: assigning it on a plain
 * object would replace the result's prototype instead of storing a value.
 *
 * @param {*} raw Frontmatter text without the `---` fences; falsy values are treated as "".
 * @returns {object} Keys mapped to the values produced by parseFrontmatterValue.
 */
function parseFrontmatter(raw) {
  const out = {};
  for (const line of String(raw || "").split("\n")) {
    const trimmed = line.trim();
    if (!trimmed || trimmed.startsWith("#")) continue;
    const match = line.match(/^([A-Za-z0-9_-]+)\s*:\s*(.*)$/);
    if (!match) continue;
    const key = match[1];
    if (key === "__proto__") continue;
    out[key] = parseFrontmatterValue(match[2].trim());
  }
  return out;
}
/**
 * Remove the surrounding quotes from a frontmatter scalar.
 *
 * Double-quoted text is decoded as a JSON string, which is the format
 * frontmatterValue writes; this keeps escaped quotes and backslashes from
 * accumulating on every save. Anything else has one leading and one trailing
 * quote character stripped.
 *
 * @param {string} text Trimmed scalar text.
 * @returns {string} The unquoted text.
 */
function unquoteFrontmatterScalar(text) {
  if (text.length >= 2 && text.startsWith("\"") && text.endsWith("\"")) {
    try {
      const decoded = JSON.parse(text);
      if (typeof decoded === "string") return decoded;
    } catch {
      // Not a valid JSON string (e.g. a hand-written "C:\path"); use plain quote stripping below.
    }
  }
  return text.replace(/^["']|["']$/g, "");
}

/**
 * Convert the text after `key:` into a boolean, number, list or string.
 *
 * @param {string} value Raw value text.
 * @returns {boolean|number|string[]|string} `true`/`false` (case-insensitive)
 *   become booleans, plain decimal numbers become numbers, `[a, b]` becomes a
 *   list of trimmed, unquoted, non-empty items, everything else stays a string.
 */
function parseFrontmatterValue(value) {
  const unquoted = unquoteFrontmatterScalar(String(value ?? "").trim());
  if (/^(true|false)$/i.test(unquoted)) return unquoted.toLowerCase() === "true";
  if (/^-?\d+(?:\.\d+)?$/.test(unquoted)) return Number(unquoted);
  if (unquoted.startsWith("[") && unquoted.endsWith("]")) {
    return unquoted
      .slice(1, -1)
      .split(",")
      // Trim before unquoting so the padding in `[ "a", "b" ]` does not hide the quotes.
      .map((item) => cleanText(unquoteFrontmatterScalar(item.trim()), 120))
      .filter(Boolean);
  }
  return unquoted;
}
/**
 * Turn submitted form values into the metadata of a community knowledge file.
 *
 * @param {object} values Submitted values (id, title, status, visibility,
 *   priority, category, tags, created_at).
 * @param {string} slug File slug, used for the default id `community.<slug>`.
 * @returns {object} Metadata with scope "community", generated false, editable
 *   true and a fresh updated_at timestamp.
 * @throws {Error} When the title is empty.
 */
function normalizeCommunityFileValues(values, slug) {
  const id = cleanText(values.id, 180) || `community.${slug}`;
  const title = cleanText(values.title, 180);
  if (!title) {
    throw new Error("Community OKF title is required.");
  }
  const status = cleanText(values.status, 40).toLowerCase() || "active";
  const requestedVisibility = cleanText(values.visibility, 20).toLowerCase() || "user";
  const numericPriority = Number(values.priority);
  const metadata = {
    id,
    title,
    scope: "community",
    status
  };
  metadata.priority = Number.isFinite(numericPriority) ? numericPriority : 0;
  // Unknown visibility values fall back to the least restrictive level.
  metadata.visibility = VISIBILITY_VALUES.has(requestedVisibility) ? requestedVisibility : "user";
  metadata.category = cleanText(values.category || "Community", 120);
  metadata.tags = splitList(values.tags);
  metadata.generated = false;
  metadata.editable = true;
  metadata.created_at = cleanText(values.created_at, 80) || new Date().toISOString();
  metadata.updated_at = new Date().toISOString();
  return metadata;
}
/**
 * Turn submitted values into the metadata of a correction knowledge file.
 *
 * A missing or blank priority (undefined, null, "" or whitespace) gets the
 * default of 100. `Number("")` and `Number(null)` are 0, so without the
 * explicit blank check an empty form field would store 0 instead of the default.
 *
 * @param {object} values Submitted values (id, title, status, visibility,
 *   priority, category, tags, created_at, source_feedback_id, source_feedback_url).
 * @param {string} slug File slug, used for the default id `correction.<slug>`.
 * @returns {object} Metadata with scope "corrections", generated false, editable
 *   true, a fresh updated_at timestamp and the feedback source fields under
 *   extra_frontmatter.
 * @throws {Error} When the title is empty.
 */
function normalizeCorrectionFileValues(values, slug) {
  const id = cleanText(values.id, 180) || `correction.${slug}`;
  const title = cleanText(values.title, 180);
  if (!title) throw new Error("Correction OKF title is required.");
  const status = cleanText(values.status, 40).toLowerCase() || "active";
  const visibility = cleanText(values.visibility, 20).toLowerCase() || "user";
  const rawPriority = values.priority;
  const hasPriority = rawPriority !== undefined && rawPriority !== null && String(rawPriority).trim() !== "";
  const numericPriority = Number(rawPriority);
  return {
    id,
    title,
    scope: "corrections",
    status,
    priority: hasPriority && Number.isFinite(numericPriority) ? numericPriority : 100,
    visibility: VISIBILITY_VALUES.has(visibility) ? visibility : "user",
    category: cleanText(values.category || "Correction", 120),
    tags: splitList(values.tags || "feedback, correction"),
    generated: false,
    editable: true,
    created_at: cleanText(values.created_at, 80) || new Date().toISOString(),
    updated_at: new Date().toISOString(),
    extra_frontmatter: {
      source_feedback_id: cleanText(values.source_feedback_id, 180),
      source_feedback_url: cleanText(values.source_feedback_url, 1000)
    }
  };
}
/**
 * Render metadata and body as a Markdown document with a frontmatter block.
 *
 * Empty-string, null and undefined values are left out of the frontmatter.
 * An empty body is replaced by a level-one heading carrying the title.
 *
 * @param {object} metadata Normalized metadata; `tags` must be an array and
 *   `extra_frontmatter` may add further keys.
 * @param {*} body Markdown body, cut to 64000 characters.
 * @returns {string} File content ending with a newline.
 */
function serializeKnowledgeFile(metadata, body) {
  const fields = {
    id: metadata.id,
    title: metadata.title,
    scope: metadata.scope,
    status: metadata.status,
    priority: metadata.priority,
    visibility: metadata.visibility,
    category: metadata.category,
    tags: metadata.tags.join(", "),
    generated: metadata.generated,
    editable: metadata.editable,
    created_at: metadata.created_at,
    updated_at: metadata.updated_at,
    ...(metadata.extra_frontmatter || {})
  };
  const isPresent = (value) => !(value === "" || value === null || value === undefined);
  const header = Object.entries(fields)
    .filter(([, value]) => isPresent(value))
    .map(([key, value]) => `${key}: ${frontmatterValue(value)}`);
  const content = cleanText(body, 64000) || `# ${metadata.title}`;
  return ["---", ...header, "---", "", content, ""].join("\n");
}
/**
 * Format one value for a `key: value` frontmatter line.
 *
 * Booleans and numbers are written bare and arrays are joined with ", ".
 * Text is written as a JSON string when it contains characters that would
 * confuse the line-based parser, or leading/trailing whitespace. Line breaks
 * count as such characters: an unquoted multi-line value would spill onto
 * extra lines, and one containing a `---` line would end the frontmatter block
 * early and drop every key written after it.
 *
 * @param {*} value Value to format.
 * @returns {string} Single-line text for the frontmatter.
 */
function frontmatterValue(value) {
  if (typeof value === "boolean" || typeof value === "number") return String(value);
  const text = Array.isArray(value) ? value.join(", ") : String(value);
  return /[:#[\]{}"'\\\r\n]|^\s|\s$/.test(text) ? JSON.stringify(text) : text;
}
/**
 * Derive an entry's normalized metadata from its parsed frontmatter and location.
 *
 * @param {object} frontmatter Parsed frontmatter values.
 * @param {string} filePath File path, used for the fallback title.
 * @param {string} relativePath Path relative to the knowledge root, used for the fallback id.
 * @param {string} scope First path segment below the knowledge root.
 * @returns {object} id, slug, title, scope, status, priority, visibility, category,
 *   tags, generated, editable, created_at, updated_at and the normalized frontmatter.
 */
function normalizeMetadata(frontmatter, filePath, relativePath, scope) {
  const pathBasedId = () => normalizePath(relativePath).replace(/\.md$/i, "").replace(/\//g, ".");
  const id = cleanText(frontmatter.id, 180) || pathBasedId();
  const title = cleanText(frontmatter.title, 180) || titleFromPath(filePath);
  const status = cleanText(frontmatter.status, 40).toLowerCase() || "active";
  const requestedVisibility = cleanText(frontmatter.visibility, 20).toLowerCase();
  const visibility = VISIBILITY_VALUES.has(requestedVisibility) ? requestedVisibility : "user";
  const numericPriority = Number(frontmatter.priority);

  let editable;
  if (frontmatter.editable === undefined) {
    // Without an explicit flag only community and correction files are editable.
    editable = scope === "community" || scope === "corrections";
  } else {
    editable = Boolean(frontmatter.editable);
  }

  return {
    id,
    slug: slugify(id),
    title,
    scope: KNOWLEDGE_SCOPES.includes(scope) ? scope : "core",
    status,
    priority: Number.isFinite(numericPriority) ? numericPriority : 0,
    visibility,
    category: cleanText(frontmatter.category || frontmatter.scope || scope, 120),
    tags: splitList(frontmatter.tags),
    generated: Boolean(frontmatter.generated),
    editable,
    created_at: cleanText(frontmatter.created_at, 80),
    updated_at: cleanText(frontmatter.updated_at, 80),
    frontmatter: normalizeFrontmatterValues(frontmatter)
  };
}
/**
 * Split a Markdown body into one chunk per heading section.
 *
 * Text before the first heading is filed under the entry title. Sections
 * without text are dropped; if that leaves no chunk although the body is not
 * blank (a body made of headings only), the whole body becomes one chunk.
 * The body is coerced to a string once, so null or undefined yields an empty
 * list instead of throwing in the fallback check.
 *
 * @param {*} body Markdown text; falsy values are treated as "".
 * @param {{id: string, title: string}} metadata Entry id and title used for
 *   chunk ids and the default heading.
 * @returns {Array<{id: string, heading: string, level: number, text: string}>} Chunks in document order.
 */
function chunkMarkdown(body, metadata) {
  const source = String(body || "");
  const chunks = [];
  let current = { heading: metadata.title, level: 1, lines: [] };
  const flush = () => {
    const text = cleanText(current.lines.join("\n"), 8000);
    if (!text) return;
    chunks.push({
      id: `${metadata.id}#${slugify(current.heading || "section")}`,
      heading: cleanText(current.heading || metadata.title, 180),
      level: current.level,
      text
    });
  };
  for (const line of source.split("\n")) {
    const heading = line.match(/^(#{1,6})\s+(.+)$/);
    if (heading) {
      // A new heading closes the section collected so far.
      flush();
      current = {
        heading: cleanText(heading[2], 180),
        level: heading[1].length,
        lines: []
      };
      continue;
    }
    current.lines.push(line);
  }
  flush();
  if (!chunks.length && source.trim()) {
    chunks.push({
      id: `${metadata.id}#body`,
      heading: metadata.title,
      level: 1,
      text: cleanText(source, 8000)
    });
  }
  return chunks;
}
/**
 * Score one chunk of an entry against the query tokens.
 *
 * The score is 100 per overlapping token plus the scope weight plus the
 * entry's own priority. With an empty query every chunk counts as one match.
 *
 * @param {object} entry Entry owning the chunk (title, category, tags, scope, priority, ...).
 * @param {object} chunk Chunk with heading, text and path.
 * @param {Set<string>} queryTokens Tokens of the search query.
 * @returns {object} Result record including score, priority, matched, summary,
 *   facts, source and source_metadata.
 */
function scoreKnowledgeChunk(entry, chunk, queryTokens) {
  const searchable = [entry.title, entry.category, entry.tags.join(" "), chunk.heading, chunk.text].join(" ");
  const textTokens = tokenSet(searchable);
  let overlap = 1;
  if (queryTokens.size) {
    overlap = intersectionSize(queryTokens, textTokens);
  }
  const score = (overlap * 100) + SCOPE_PRIORITY[entry.scope] + Number(entry.priority || 0);
  const excerpt = excerptForChunk(chunk.text, queryTokens);
  const filePath = `knowledge/${chunk.path}`;
  const anchor = chunk.heading ? `#${slugify(chunk.heading)}` : "";
  return {
    id: entry.id,
    slug: entry.slug,
    title: entry.title,
    category: entry.category,
    visibility: entry.visibility,
    summary: excerpt,
    facts: chunk.text.slice(0, 4000),
    priority: SCOPE_PRIORITY[entry.scope] + Number(entry.priority || 0),
    score,
    matched: overlap > 0,
    source: `${filePath}${anchor}`,
    source_metadata: {
      path: filePath,
      id: entry.id,
      heading: chunk.heading,
      score,
      excerpt
    }
  };
}
/**
 * Decide whether a user with the given access flags may see an entry.
 *
 * Entries whose status is not an active one are visible to admins only.
 * "admin" visibility requires an admin; "mod" requires a moderator or admin.
 *
 * @param {object} entry Entry with `status` and `visibility`.
 * @param {{isAdmin: boolean, isMod: boolean}} access Flags from accessForUser.
 * @returns {boolean} Whether the entry is visible.
 */
function canSeeKnowledgeEntry(entry, access) {
  if (!entry) return false;
  const isActive = ACTIVE_STATUSES.has(entry.status);
  if (!isActive && !access.isAdmin) return false;
  switch (entry.visibility) {
    case "admin":
      return access.isAdmin;
    case "mod":
      return access.isMod || access.isAdmin;
    default:
      return true;
  }
}
/**
 * Reduce a user object to the access flags used by the knowledge checks.
 *
 * @param {object} [user] User with optional isAdmin and isMod flags.
 * @returns {{authenticated: boolean, isAdmin: boolean, isMod: boolean}} Admins
 *   always count as moderators; a falsy user is unauthenticated.
 */
function accessForUser(user) {
  const isAdmin = Boolean(user?.isAdmin);
  const isMod = isAdmin || Boolean(user?.isMod);
  const authenticated = Boolean(user);
  return { authenticated, isAdmin, isMod };
}
/**
 * Recursively collect the paths of all `.md` files below a directory.
 * Files and directories whose name starts with "." are skipped.
 *
 * @param {string} dir Directory to scan.
 * @param {string[]} output Array that receives the file paths.
 * @returns {void}
 */
function scanMarkdownFiles(dir, output) {
  const visible = fs.readdirSync(dir, { withFileTypes: true })
    .filter((dirent) => !dirent.name.startsWith("."));
  for (const dirent of visible) {
    const fullPath = path.join(dir, dirent.name);
    if (dirent.isDirectory()) {
      scanMarkdownFiles(fullPath, output);
      continue;
    }
    const isMarkdown = dirent.name.toLowerCase().endsWith(".md");
    if (dirent.isFile() && isMarkdown) {
      output.push(fullPath);
    }
  }
}
/**
 * Build a short excerpt of a chunk, centred on the first query token found in it.
 *
 * @param {*} text Chunk text; only the first 1200 cleaned characters are considered.
 * @param {Set<string>} queryTokens Lower-case query tokens.
 * @returns {string} Up to 420 characters starting 120 before the hit, or the
 *   first 360 characters when there is no query or no hit.
 */
function excerptForChunk(text, queryTokens) {
  const cleaned = cleanText(text, 1200);
  const head = cleaned.slice(0, 360);
  if (!queryTokens.size) return head;
  const lower = cleaned.toLowerCase();
  const hit = [...queryTokens].find((candidate) => lower.includes(candidate));
  if (!hit) return head;
  const start = Math.max(lower.indexOf(hit) - 120, 0);
  return cleaned.slice(start, start + 420).trim();
}
/**
 * Break text into a set of lower-case search tokens.
 *
 * camelCase words contribute both the joined word and its parts. Tokens are
 * runs of a-z, 0-9 and "_"; tokens shorter than two characters and stopwords
 * are dropped.
 *
 * @param {*} value Text to tokenize (first 4000 cleaned characters).
 * @returns {Set<string>} Tokens in first-seen order.
 */
function tokenSet(value) {
  const cleaned = cleanText(value, 4000);
  const camelSplit = cleaned.replace(/([a-z0-9])([A-Z])/g, "$1 $2");
  const words = `${cleaned} ${camelSplit}`.toLowerCase().split(/[^a-z0-9_]+/);
  const keep = (word) => word.length >= 2 && !SEARCH_STOPWORDS.has(word);
  return new Set(words.filter(keep));
}
/**
 * Count how many items of `a` are also contained in `b`.
 *
 * @param {Iterable<*>} a Items to test.
 * @param {{has: function(*): boolean}} b Collection offering `has`, e.g. a Set.
 * @returns {number} Number of shared items.
 */
function intersectionSize(a, b) {
  return Array.from(a).reduce((total, item) => (b.has(item) ? total + 1 : total), 0);
}
/**
 * Convert raw frontmatter into a flat map of safe keys to string values.
 *
 * Characters outside A-Z, a-z, 0-9, "_" and "-" in a key become "_"; entries
 * whose key or string value ends up empty are dropped.
 *
 * @param {object} [frontmatter={}] Parsed frontmatter.
 * @returns {object} Normalized key/value pairs.
 */
function normalizeFrontmatterValues(frontmatter = {}) {
  const out = {};
  Object.keys(frontmatter).forEach((key) => {
    const safeKey = cleanText(key, 80).replace(/[^A-Za-z0-9_-]/g, "_");
    const text = placeholderValue(frontmatter[key]);
    if (!safeKey || !text) return;
    out[safeKey] = text;
  });
  return out;
}
/**
 * Convert a frontmatter value to the string used for placeholders.
 *
 * @param {*} value Array, boolean, number or anything else.
 * @returns {string} Arrays are joined with ", " after dropping empty items,
 *   booleans become "true"/"false", finite numbers their decimal text; every
 *   other value is cleaned text limited to 1000 characters ("" when falsy).
 */
function placeholderValue(value) {
  if (Array.isArray(value)) {
    const parts = value.map((item) => cleanText(item, 240));
    return parts.filter(Boolean).join(", ");
  }
  switch (typeof value) {
    case "boolean":
      return value ? "true" : "false";
    case "number":
      if (Number.isFinite(value)) return String(value);
      break;
    default:
      break;
  }
  return cleanText(value, 1000);
}
/**
 * Map a visibility level to the minimum viewer role.
 *
 * @param {*} value Visibility value; anything unknown counts as "user".
 * @returns {"admin"|"mod"|"user"} Required role.
 */
function visibilityRole(value) {
  const visibility = VISIBILITY_VALUES.has(value) ? value : "user";
  if (visibility === "admin") return "admin";
  if (visibility === "mod") return "mod";
  return "user";
}
/**
 * Map a visibility level to the sensitivity label of a placeholder definition.
 *
 * @param {*} value Visibility value; anything unknown counts as "user".
 * @returns {"admin"|"moderator"|"public_safe"} Sensitivity label.
 */
function visibilitySensitivity(value) {
  const visibility = VISIBILITY_VALUES.has(value) ? value : "user";
  if (visibility === "admin") return "admin";
  if (visibility === "mod") return "moderator";
  return "public_safe";
}
/**
 * Normalize a list given as an array or as comma-separated text.
 *
 * @param {*} value Array of items or comma-separated string; falsy values give [].
 * @returns {string[]} At most 50 cleaned, non-empty items of up to 120 characters each.
 */
function splitList(value) {
  const rawItems = Array.isArray(value) ? value : String(value || "").split(",");
  const cleaned = rawItems.map((item) => cleanText(item, 120));
  return cleaned.filter(Boolean).slice(0, 50);
}
/**
 * Derive a display title from a file name: drop the extension, turn runs of
 * "-" and "_" into spaces and upper-case the first letter of each word.
 *
 * @param {string} filePath Path of the file.
 * @returns {string} Generated title.
 */
function titleFromPath(filePath) {
  const stem = path.basename(filePath, path.extname(filePath));
  const spaced = stem.replace(/[-_]+/g, " ");
  return spaced.replace(/\b\w/g, (letter) => letter.toUpperCase());
}
/**
 * Turn text into a lower-case slug of a-z, 0-9 and single dashes.
 *
 * @param {*} value Text to convert (first 180 cleaned characters).
 * @returns {string} The slug, or "entry" when nothing usable remains.
 */
function slugify(value) {
  const lowered = cleanText(value, 180).toLowerCase();
  const dashed = lowered.replace(/[^a-z0-9]+/g, "-");
  const trimmed = dashed.replace(/^-+|-+$/g, "");
  return trimmed || "entry";
}
/**
 * Convert backslashes to forward slashes so paths compare the same on every platform.
 *
 * @param {*} value Path text; falsy values give "".
 * @returns {string} Path using "/" separators.
 */
function normalizePath(value) {
  const text = String(value || "");
  return text.split("\\").join("/");
}
/**
 * Coerce a value to trimmed text with "\n" line endings and a length limit.
 *
 * @param {*} value Value to clean; every falsy value (including 0 and false) gives "".
 * @param {number} [maximum=4000] Maximum number of characters kept.
 * @returns {string} Cleaned text.
 */
function cleanText(value, maximum = 4000) {
  const text = String(value || "");
  const unixNewlines = text.replace(/\r\n?/g, "\n");
  return unixNewlines.trim().slice(0, maximum);
}
// Public API of this module; every function not listed here is internal to the file.
module.exports = {
KNOWLEDGE_SCOPES,
ensureKnowledgeDirs,
getCommunityKnowledgeFile,
knowledgeRoot,
listKnowledgeFiles,
listCommunityKnowledgeFiles,
listKnowledgePlaceholders,
loadKnowledgeEntries,
migrateSingleBracePlaceholders,
parseKnowledgeFile,
registerKnowledgePlaceholderDefinitions,
saveCorrectionKnowledgeFile,
saveCommunityKnowledgeFile,
searchFileKnowledge
};