FIX: stabilize diff algorithm for streaming (#1358)

Previous to this fix we would diff the entire body of text, this
could lead to situations where a diff presented to a user was
wildly off matching areas in the text that should not have been
tested.

New algorithm only checks a portion of the string, this ensures
that during streaming there is no chance for wild mistakes
This commit is contained in:
Sam 2025-05-22 16:31:21 +10:00 committed by GitHub
parent c29183fc2d
commit 55dab9c68a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 117 additions and 1 deletions

View File

@ -4,6 +4,8 @@ import loadJSDiff from "discourse/lib/load-js-diff";
import { parseAsync } from "discourse/lib/text"; import { parseAsync } from "discourse/lib/text";
const DEFAULT_CHAR_TYPING_DELAY = 30; const DEFAULT_CHAR_TYPING_DELAY = 30;
const STREAMING_DIFF_TRUNCATE_THRESHOLD = 0.1;
const STREAMING_DIFF_TRUNCATE_BUFFER = 10;
export default class DiffStreamer { export default class DiffStreamer {
@tracked isStreaming = false; @tracked isStreaming = false;
@ -122,6 +124,53 @@ export default class DiffStreamer {
return maybeUnfinishedImage || maybeUnfinishedLink; return maybeUnfinishedImage || maybeUnfinishedLink;
} }
// this is public to make testing easier
// is makes it easier to do a "streaming diff" where we want to ensure diff
// is focused on the beginning of the text instead of taking the entire body
// into account.
// This ensures that we do not make mistakes and present wildly different diffs
// to what we would stablize on at the end of the stream.
streamingDiff(original, suggestion) {
const maxDiffLength = Math.floor(
suggestion.length +
suggestion.length * STREAMING_DIFF_TRUNCATE_THRESHOLD +
STREAMING_DIFF_TRUNCATE_BUFFER
);
const head = original.slice(0, maxDiffLength);
const tail = original.slice(maxDiffLength);
const diffArray = this.jsDiff.diffWordsWithSpace(head, suggestion);
if (tail.length > 0) {
// if last in the array is added, and previous is removed then flip them
let last = diffArray[diffArray.length - 1];
let secondLast = diffArray[diffArray.length - 2];
if (last.added && secondLast.removed) {
diffArray.pop();
diffArray.pop();
diffArray.push(last);
diffArray.push(secondLast);
last = secondLast;
secondLast = diffArray[diffArray.length - 2];
}
if (!last.removed) {
last = {
added: false,
removed: true,
value: "",
};
diffArray.push(last);
}
last.value = last.value + tail;
}
return diffArray;
}
async #streamNextChar() { async #streamNextChar() {
if (this.currentWordIndex < this.words.length) { if (this.currentWordIndex < this.words.length) {
const currentToken = this.words[this.currentWordIndex]; const currentToken = this.words[this.currentWordIndex];
@ -134,7 +183,7 @@ export default class DiffStreamer {
this.currentWordIndex++; this.currentWordIndex++;
this.currentCharIndex = 0; this.currentCharIndex = 0;
const originalDiff = this.jsDiff.diffWordsWithSpace( const originalDiff = this.streamingDiff(
this.selectedText, this.selectedText,
this.suggestion this.suggestion
); );

View File

@ -0,0 +1,67 @@
import { module, test } from "qunit";
import DiffStreamer from "discourse/plugins/discourse-ai/discourse/lib/diff-streamer";
module("Unit | Lib | diff-streamer", function () {
test("streamingDiff correctly handles trivial cases", async function (assert) {
const originalText = "helo world";
const targetText = "hello world";
const diffStreamer = new DiffStreamer(this.originalTextContent);
await diffStreamer.loadJSDiff();
const diffResult = diffStreamer.streamingDiff(originalText, targetText);
const expectedDiff = [
{ count: 1, added: false, removed: true, value: "helo" },
{ count: 1, added: true, removed: false, value: "hello" },
{ count: 2, added: false, removed: false, value: " world" },
];
assert.deepEqual(
diffResult,
expectedDiff,
"Diff result should match the expected structure"
);
});
test("streamingDiff correctly consolidates and handles diff drift", async function (assert) {
const originalText =
"This is todone, but I want to can why.\n\nWe\n\nSEO Tags supports a `canonical_url` override. I tried a few possibilities there. The one I wanted to work, ex: `https://www.discourse.org/de`, appended an extra `/de` on the URL, only in the deploy b";
const targetText = "This is to-done";
const diffStreamer = new DiffStreamer(this.originalTextContent);
await diffStreamer.loadJSDiff();
const diffResult = diffStreamer.streamingDiff(originalText, targetText);
// Verify the diff result is an array with the expected structure
assert.true(Array.isArray(diffResult), "Diff result should be an array");
assert.strictEqual(
diffResult.length,
3,
"Expecting exactly three parts in the diff result"
);
assert.strictEqual(
diffResult[0].value,
"This is ",
"First part should be unchanged"
);
assert.strictEqual(
diffResult[1].value,
"to-done",
"Second part should be an insertion"
);
assert.true(diffResult[1].added, "Second part should be an insertion");
assert.strictEqual(
diffResult[2].value.length,
235,
"Third part should include all text"
);
assert.true(diffResult[2].removed, "Third part should be a removal");
});
});