142 lines
4.4 KiB
JavaScript
142 lines
4.4 KiB
JavaScript
const fs = require("fs");
|
|
const crypto = require("crypto");
|
|
const { resolveData } = require("./paths");
|
|
const { atomicJson, paginate } = require("./feedback");
|
|
|
|
/**
 * File-backed store for evaluation cases and the results of running them.
 *
 * Cases and results are kept in two separate JSON files, each holding an
 * object with an `entries` array ordered newest-first.
 */
class EvalStore {
  /**
   * @param {object} [options]
   * @param {string} [options.casesFile] Path of the cases file; defaults to
   *   `resolveData("evals", "cases.json")`.
   * @param {string} [options.resultsFile] Path of the results file; defaults
   *   to `resolveData("evals", "results.json")`.
   */
  constructor(options = {}) {
    this.casesFile = options.casesFile || resolveData("evals", "cases.json");
    this.resultsFile = options.resultsFile || resolveData("evals", "results.json");
  }

  /**
   * Validates a new eval case and stores it at the front of the case list.
   *
   * @param {object} values Raw field values. Text fields are trimmed and
   *   length-capped with `clean`; `role` is passed through `normalizeRole`;
   *   `origin` falls back to "webui" when empty.
   * @param {{ id: * }} actor Creator of the case; `actor.id` is stored as a string.
   * @returns {object} The entry that was written.
   * @throws {Error} When the prompt is empty after cleaning.
   */
  add(values, actor) {
    const prompt = clean(values.prompt, 6000);
    if (!prompt) throw new Error("Eval prompt is required.");

    const entry = {
      id: crypto.randomUUID(),
      prompt,
      role: normalizeRole(values.role),
      origin: clean(values.origin, 80) || "webui",
      expected_behavior: clean(values.expected_behavior, 8000),
      forbidden_behavior: clean(values.forbidden_behavior, 8000),
      expected_link: clean(values.expected_link, 2000),
      notes: clean(values.notes, 4000),
      created_by: String(actor.id),
      created_at: new Date().toISOString()
    };

    const store = this.readCases();
    store.entries.unshift(entry);
    atomicJson(this.casesFile, store);
    return entry;
  }

  /**
   * Returns one page of stored cases.
   *
   * @param {object} [params]
   * @param {number} [params.page=1]
   * @param {number} [params.pageSize=20]
   * @returns {*} Whatever `paginate` produces for the case list.
   */
  list({ page = 1, pageSize = 20 } = {}) {
    return paginate(this.readCases().entries, page, pageSize);
  }

  /**
   * Returns the first `limit` stored results (the list is kept newest-first).
   *
   * @param {number} [limit=100]
   * @returns {object[]}
   */
  results(limit = 100) {
    return this.readResults().entries.slice(0, limit);
  }

  /**
   * Removes the case with the given id.
   *
   * @param {string} id
   * @returns {boolean} `true` when a case was removed and the file rewritten,
   *   `false` when no case had that id (the file is left untouched).
   */
  delete(id) {
    const store = this.readCases();
    const before = store.entries.length;
    store.entries = store.entries.filter((entry) => entry.id !== id);
    if (store.entries.length === before) return false;
    atomicJson(this.casesFile, store);
    return true;
  }

  /**
   * Runs every stored case against `provider`, one at a time, scores each
   * answer with `evaluateCase`, and prepends the batch to the results file
   * (only the first 1000 entries are kept).
   *
   * A case whose generation or scoring throws does not abort the run; it is
   * recorded with status "manual_review" and the failure text.
   *
   * @param {object} params
   * @param {{ generate: Function }} params.provider Object whose `generate`
   *   is awaited and whose result's `text` and `links` are scored.
   * @param {{ id: * }} params.actor User who triggered the run; the id is
   *   embedded in the simulated user's id.
   * @returns {Promise<object[]>} Results of this run, in case order.
   */
  async runAll({ provider, actor }) {
    const results = [];

    for (const testCase of this.readCases().entries) {
      const simulatedUser = {
        id: `eval:${actor.id}`,
        username: "lumi-eval",
        isAdmin: testCase.role === "admin",
        isMod: testCase.role === "mod"
      };

      try {
        const response = await provider.generate({
          message: testCase.prompt,
          user: simulatedUser,
          sessionId: `eval:${testCase.id}:${Date.now()}`,
          scope: "eval",
          originContext: {
            origin: testCase.origin,
            platform: testCase.origin,
            role: testCase.role,
            permission_context: { webui_actions_allowed: false }
          }
        });
        results.push(evaluateCase(testCase, response.text, response.links));
      } catch (error) {
        // Anything can be thrown, not only Error instances. Reading `.message`
        // from `null`/`undefined` would throw inside this handler and abort the
        // whole run before any result is saved; a thrown string would leave
        // `error` undefined. Fall back to the stringified value instead.
        const message = error != null && error.message != null ? error.message : String(error);
        results.push({
          case_id: testCase.id,
          prompt: testCase.prompt,
          status: "manual_review",
          error: message,
          run_at: new Date().toISOString()
        });
      }
    }

    // The results file is read only after all cases have finished, so the
    // newest batch is merged into the latest on-disk state.
    const store = this.readResults();
    store.entries = [...results, ...store.entries].slice(0, 1000);
    atomicJson(this.resultsFile, store);
    return results;
  }

  /** @returns {{ entries: object[] }} Current contents of the cases file. */
  readCases() {
    return readStore(this.casesFile);
  }

  /** @returns {{ entries: object[] }} Current contents of the results file. */
  readResults() {
    return readStore(this.resultsFile);
  }
}
|
|
|
|
/**
 * Scores one answer against the automated checks of a test case.
 *
 * `expected_behavior` and `forbidden_behavior` are split into individual
 * checks by `splitChecks`; each check is matched as a case-insensitive
 * substring of the answer. `expected_link` passes when it occurs verbatim in
 * the answer or equals the `href` of one of `links`.
 *
 * @param {object} testCase Case with `id`, `prompt` and the optional check
 *   fields `expected_behavior`, `forbidden_behavior`, `expected_link`.
 * @param {*} answer Answer text; falsy values are treated as "".
 * @param {Array<{ href: string }>} [links=[]] Links returned with the answer.
 *   Anything that is not an array is treated as "no links".
 * @returns {object} Result record. `status` is "manual_review" when the case
 *   defines no automated checks, otherwise "pass" or "fail". `answer` holds at
 *   most the first 16000 characters of the text.
 */
function evaluateCase(testCase, answer, links = []) {
  const text = String(answer || "");
  // Lower-case the answer once instead of once per check.
  const haystack = text.toLowerCase();
  const expected = splitChecks(testCase.expected_behavior);
  const forbidden = splitChecks(testCase.forbidden_behavior);
  // The default parameter only covers `undefined`; `null` or a non-array value
  // would otherwise make `.some` throw and fail the whole evaluation.
  const linkList = Array.isArray(links) ? links : [];

  const expectedPass = expected.every((check) => haystack.includes(check.toLowerCase()));
  const forbiddenPass = forbidden.every((check) => !haystack.includes(check.toLowerCase()));
  const linkPass =
    !testCase.expected_link ||
    text.includes(testCase.expected_link) ||
    linkList.some((link) => link != null && link.href === testCase.expected_link);
  const hasAutomatedChecks =
    expected.length > 0 || forbidden.length > 0 || Boolean(testCase.expected_link);

  let status = "manual_review";
  if (hasAutomatedChecks) {
    status = expectedPass && forbiddenPass && linkPass ? "pass" : "fail";
  }

  return {
    case_id: testCase.id,
    prompt: testCase.prompt,
    status,
    expected_pass: expectedPass,
    forbidden_pass: forbiddenPass,
    link_pass: linkPass,
    answer: text.slice(0, 16000),
    run_at: new Date().toISOString()
  };
}
|
|
|
|
/**
 * Breaks a free-text check list into individual checks.
 *
 * Entries are separated by line breaks (LF or CRLF) or semicolons; each entry
 * is trimmed and empty entries are dropped. Falsy input yields an empty list.
 *
 * @param {*} value Raw check text.
 * @returns {string[]} Non-empty, trimmed checks in their original order.
 */
function splitChecks(value) {
  const checks = [];
  const pieces = String(value || "").split(/\r?\n|;/);
  for (const piece of pieces) {
    const trimmed = piece.trim();
    if (trimmed) {
      checks.push(trimmed);
    }
  }
  return checks;
}
|
|
|
|
/**
 * Loads a JSON store file and normalises it to `{ entries: [...] }`.
 *
 * Reading is best-effort: any failure yields an empty store so callers always
 * get a usable object. A missing file is the normal "nothing stored yet" case
 * and is silent; every other failure (unparsable JSON, permission problems)
 * is reported with a warning, because the next write would replace whatever
 * is currently on disk.
 *
 * @param {string} file Path of the JSON file.
 * @returns {{ entries: Array }} The file's `entries` array, or an empty array
 *   when the file is missing, unreadable, or has no `entries` array.
 */
function readStore(file) {
  try {
    const parsed = JSON.parse(fs.readFileSync(file, "utf8"));
    // `parsed` may legitimately be null or a primitive (e.g. a file holding `null`).
    const entries = parsed != null && Array.isArray(parsed.entries) ? parsed.entries : [];
    return { entries };
  } catch (error) {
    if (!error || error.code !== "ENOENT") {
      const reason = error && error.message ? error.message : String(error);
      console.warn(`Could not read eval store "${file}": ${reason}`);
    }
    return { entries: [] };
  }
}
|
|
|
|
/**
 * Normalises a free-text field: coerces to string, trims surrounding
 * whitespace, and caps the length at `max` UTF-16 code units.
 *
 * Falsy input (`null`, `undefined`, `""`, `0`, `false`) becomes "".
 *
 * @param {*} value Raw field value.
 * @param {number} max Maximum length of the result in UTF-16 code units.
 * @returns {string} The cleaned text, never longer than `max`.
 */
function clean(value, max) {
  const text = String(value || "").trim();
  const truncated = text.slice(0, max);

  // `slice` counts UTF-16 code units, so the cut can land between the two
  // halves of a surrogate pair (e.g. an emoji). Drop the dangling high
  // surrogate rather than storing half a character.
  if (truncated.length > 0 && truncated.length < text.length) {
    const last = truncated.charCodeAt(truncated.length - 1);
    const next = text.charCodeAt(truncated.length);
    const cutsPair = last >= 0xd800 && last <= 0xdbff && next >= 0xdc00 && next <= 0xdfff;
    if (cutsPair) return truncated.slice(0, -1);
  }
  return truncated;
}
|
|
|
|
/**
 * Maps an arbitrary value onto one of the supported roles.
 *
 * @param {*} value Candidate role.
 * @returns {string} `value` itself when it is exactly "admin", "mod" or
 *   "user"; otherwise "user".
 */
function normalizeRole(value) {
  switch (value) {
    case "admin":
    case "mod":
    case "user":
      return value;
    default:
      return "user";
  }
}
|
|
|
|
// Public API of this module: the store class and the standalone case scorer.
module.exports = { EvalStore, evaluateCase };
|