Lumi/plugins/lumi_ai/backend/evals.js
2026-06-12 19:27:43 +02:00

142 lines
4.4 KiB
JavaScript

const fs = require("fs");
const crypto = require("crypto");
const { resolveData } = require("./paths");
const { atomicJson, paginate } = require("./feedback");
/**
 * File-backed store for eval cases and the results of running them.
 *
 * Both files hold JSON shaped `{ entries: [...] }`; new cases and new results
 * are placed at the front of their respective `entries` arrays.
 */
class EvalStore {
  /**
   * @param {object} [options]
   * @param {string} [options.casesFile] - Path of the cases file. Falls back to
   *   `resolveData("evals", "cases.json")`.
   * @param {string} [options.resultsFile] - Path of the results file. Falls back
   *   to `resolveData("evals", "results.json")`.
   */
  constructor(options = {}) {
    this.casesFile = options.casesFile || resolveData("evals", "cases.json");
    this.resultsFile = options.resultsFile || resolveData("evals", "results.json");
  }

  /**
   * Creates a new eval case, prepends it to the cases file and returns it.
   *
   * Free-text fields are passed through `clean` with a per-field length cap;
   * `role` is passed through `normalizeRole`; `origin` defaults to "webui".
   *
   * @param {object} values - Raw case fields (prompt, role, origin,
   *   expected_behavior, forbidden_behavior, expected_link, notes).
   * @param {{ id: * }} actor - Creator; `actor.id` is stored as a string.
   * @returns {object} The stored case entry.
   * @throws {Error} When the cleaned prompt is empty.
   */
  add(values, actor) {
    const prompt = clean(values.prompt, 6000);
    if (!prompt) throw new Error("Eval prompt is required.");
    const entry = {
      id: crypto.randomUUID(),
      prompt,
      role: normalizeRole(values.role),
      origin: clean(values.origin, 80) || "webui",
      expected_behavior: clean(values.expected_behavior, 8000),
      forbidden_behavior: clean(values.forbidden_behavior, 8000),
      expected_link: clean(values.expected_link, 2000),
      notes: clean(values.notes, 4000),
      created_by: String(actor.id),
      created_at: new Date().toISOString()
    };
    const store = this.readCases();
    store.entries.unshift(entry);
    atomicJson(this.casesFile, store);
    return entry;
  }

  /**
   * Returns one page of stored cases, as produced by `paginate`.
   *
   * @param {object} [query]
   * @param {number} [query.page=1]
   * @param {number} [query.pageSize=20]
   */
  list({ page = 1, pageSize = 20 } = {}) {
    return paginate(this.readCases().entries, page, pageSize);
  }

  /**
   * Returns the first `limit` stored results.
   *
   * @param {number} [limit=100]
   * @returns {object[]}
   */
  results(limit = 100) {
    return this.readResults().entries.slice(0, limit);
  }

  /**
   * Removes the case with the given id.
   *
   * @param {string} id
   * @returns {boolean} `true` when a case was removed and the file rewritten,
   *   `false` when no case matched (the file is left untouched).
   */
  delete(id) {
    const store = this.readCases();
    const before = store.entries.length;
    store.entries = store.entries.filter((entry) => entry.id !== id);
    if (store.entries.length === before) return false;
    atomicJson(this.casesFile, store);
    return true;
  }

  /**
   * Runs every stored case against `provider`, one after another, and records
   * the outcomes at the front of the results file (capped at 1000 entries).
   *
   * A case whose provider call or evaluation throws is recorded with status
   * "manual_review" and the failure text in `error`; it never aborts the run.
   *
   * @param {object} args
   * @param {{ generate: Function }} args.provider - Called once per case via
   *   `provider.generate({...})`; the resolved value's `text` and `links` are
   *   handed to `evaluateCase`.
   * @param {{ id: * }} args.actor - Used to build the simulated user id.
   * @returns {Promise<object[]>} Results of this run, in case order.
   */
  async runAll({ provider, actor }) {
    const results = [];
    for (const testCase of this.readCases().entries) {
      const simulatedUser = {
        id: `eval:${actor.id}`,
        username: "lumi-eval",
        isAdmin: testCase.role === "admin",
        isMod: testCase.role === "mod"
      };
      try {
        const response = await provider.generate({
          message: testCase.prompt,
          user: simulatedUser,
          sessionId: `eval:${testCase.id}:${Date.now()}`,
          scope: "eval",
          originContext: {
            origin: testCase.origin,
            platform: testCase.origin,
            role: testCase.role,
            permission_context: { webui_actions_allowed: false }
          }
        });
        results.push(evaluateCase(testCase, response.text, response.links));
      } catch (error) {
        results.push({
          case_id: testCase.id,
          prompt: testCase.prompt,
          status: "manual_review",
          // Anything can be thrown or used as a rejection reason (strings,
          // undefined, null). Reading `.message` blindly would either lose the
          // reason or throw from inside this handler and abort the whole run.
          error: error?.message ?? String(error),
          run_at: new Date().toISOString()
        });
      }
    }
    const store = this.readResults();
    store.entries = [...results, ...store.entries].slice(0, 1000);
    atomicJson(this.resultsFile, store);
    return results;
  }

  /** Reads the cases file through `readStore`. */
  readCases() {
    return readStore(this.casesFile);
  }

  /** Reads the results file through `readStore`. */
  readResults() {
    return readStore(this.resultsFile);
  }
}
/**
 * Scores a provider answer against the automated checks of one eval case.
 *
 * - `expected_behavior` / `forbidden_behavior` are split by `splitChecks`;
 *   each resulting check is a case-insensitive substring test on the answer.
 * - `expected_link` passes when the answer contains it verbatim or when any
 *   entry of `links` has a matching `href`.
 * - A case with no checks at all is reported as "manual_review".
 *
 * @param {object} testCase - Case with `id`, `prompt` and the optional check
 *   fields `expected_behavior`, `forbidden_behavior`, `expected_link`.
 * @param {*} answer - Provider answer; falsy values are treated as "".
 * @param {Array<{ href: string }>|null} [links] - Links returned with the
 *   answer. Anything that is not an array is treated as "no links".
 * @returns {object} Result with `case_id`, `prompt`, `status`
 *   ("pass" | "fail" | "manual_review"), the three `*_pass` booleans, the
 *   answer truncated to 16000 characters, and an ISO `run_at` timestamp.
 */
function evaluateCase(testCase, answer, links = []) {
  const text = String(answer || "");
  // Lower-case the answer once instead of once per check.
  const haystack = text.toLowerCase();
  const mentions = (check) => haystack.includes(check.toLowerCase());

  const expected = splitChecks(testCase.expected_behavior);
  const forbidden = splitChecks(testCase.forbidden_behavior);
  const expectedPass = expected.every(mentions);
  const forbiddenPass = !forbidden.some(mentions);

  // The `links = []` default only covers `undefined`; a provider returning
  // `links: null` (or a list with empty slots) must not crash the scoring.
  const linkList = Array.isArray(links) ? links : [];
  const linkPass = !testCase.expected_link ||
    text.includes(testCase.expected_link) ||
    linkList.some((link) => link?.href === testCase.expected_link);

  const hasAutomatedChecks = expected.length > 0 ||
    forbidden.length > 0 ||
    Boolean(testCase.expected_link);

  let status = "manual_review";
  if (hasAutomatedChecks) {
    status = expectedPass && forbiddenPass && linkPass ? "pass" : "fail";
  }

  return {
    case_id: testCase.id,
    prompt: testCase.prompt,
    status,
    expected_pass: expectedPass,
    forbidden_pass: forbiddenPass,
    link_pass: linkPass,
    answer: text.slice(0, 16000),
    run_at: new Date().toISOString()
  };
}
/**
 * Breaks a free-text checks field into individual checks.
 *
 * Checks are separated by line breaks or semicolons; surrounding whitespace
 * is trimmed and empty pieces are dropped. Falsy input yields an empty list.
 *
 * @param {*} value
 * @returns {string[]}
 */
function splitChecks(value) {
  const source = String(value || "");
  const checks = [];
  for (const piece of source.split(/\r?\n|;/)) {
    const trimmed = piece.trim();
    if (trimmed) {
      checks.push(trimmed);
    }
  }
  return checks;
}
/**
 * Loads a `{ entries: [...] }` store from a JSON file.
 *
 * Best effort: if the file cannot be read or parsed, or its `entries` value is
 * not an array, an empty store is returned instead of an error.
 *
 * @param {string} file - Path of the JSON file.
 * @returns {{ entries: Array }}
 */
function readStore(file) {
  let entries = [];
  try {
    const raw = fs.readFileSync(file, "utf8");
    const parsed = JSON.parse(raw);
    if (Array.isArray(parsed.entries)) {
      entries = parsed.entries;
    }
  } catch {
    // Unreadable or malformed file: fall back to an empty store.
    entries = [];
  }
  return { entries };
}
/**
 * Normalises a free-text field: falsy values become "", everything else is
 * stringified, trimmed, and then cut to at most `max` characters.
 *
 * @param {*} value
 * @param {number} max - Upper bound passed to `String.prototype.slice`.
 * @returns {string}
 */
function clean(value, max) {
  const text = value ? String(value) : "";
  const trimmed = text.trim();
  return trimmed.slice(0, max);
}
/**
 * Maps an arbitrary value onto one of the known roles.
 *
 * @param {*} value
 * @returns {"admin"|"mod"|"user"} `value` itself when it is exactly "admin",
 *   "mod" or "user"; otherwise "user".
 */
function normalizeRole(value) {
  switch (value) {
    case "admin":
    case "mod":
    case "user":
      return value;
    default:
      return "user";
  }
}
// Public surface of this module: the file-backed store class and the
// standalone scoring function it uses for each case.
module.exports = { EvalStore, evaluateCase };