FIX: stabilize diff algorithm for streaming (#1358)
Previous to this fix we would diff the entire body of text, this could lead to situations where a diff presented to a user was wildly off matching areas in the text that should not have been tested. New algorithm only checks a portion of the string, this ensures that during streaming there is no chance for wild mistakes
This commit is contained in:
parent
c29183fc2d
commit
55dab9c68a
|
@ -4,6 +4,8 @@ import loadJSDiff from "discourse/lib/load-js-diff";
|
||||||
import { parseAsync } from "discourse/lib/text";
|
import { parseAsync } from "discourse/lib/text";
|
||||||
|
|
||||||
const DEFAULT_CHAR_TYPING_DELAY = 30;
|
const DEFAULT_CHAR_TYPING_DELAY = 30;
|
||||||
|
const STREAMING_DIFF_TRUNCATE_THRESHOLD = 0.1;
|
||||||
|
const STREAMING_DIFF_TRUNCATE_BUFFER = 10;
|
||||||
|
|
||||||
export default class DiffStreamer {
|
export default class DiffStreamer {
|
||||||
@tracked isStreaming = false;
|
@tracked isStreaming = false;
|
||||||
|
@ -122,6 +124,53 @@ export default class DiffStreamer {
|
||||||
return maybeUnfinishedImage || maybeUnfinishedLink;
|
return maybeUnfinishedImage || maybeUnfinishedLink;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// this is public to make testing easier
|
||||||
|
// is makes it easier to do a "streaming diff" where we want to ensure diff
|
||||||
|
// is focused on the beginning of the text instead of taking the entire body
|
||||||
|
// into account.
|
||||||
|
// This ensures that we do not make mistakes and present wildly different diffs
|
||||||
|
// to what we would stablize on at the end of the stream.
|
||||||
|
streamingDiff(original, suggestion) {
|
||||||
|
const maxDiffLength = Math.floor(
|
||||||
|
suggestion.length +
|
||||||
|
suggestion.length * STREAMING_DIFF_TRUNCATE_THRESHOLD +
|
||||||
|
STREAMING_DIFF_TRUNCATE_BUFFER
|
||||||
|
);
|
||||||
|
const head = original.slice(0, maxDiffLength);
|
||||||
|
const tail = original.slice(maxDiffLength);
|
||||||
|
|
||||||
|
const diffArray = this.jsDiff.diffWordsWithSpace(head, suggestion);
|
||||||
|
|
||||||
|
if (tail.length > 0) {
|
||||||
|
// if last in the array is added, and previous is removed then flip them
|
||||||
|
let last = diffArray[diffArray.length - 1];
|
||||||
|
let secondLast = diffArray[diffArray.length - 2];
|
||||||
|
|
||||||
|
if (last.added && secondLast.removed) {
|
||||||
|
diffArray.pop();
|
||||||
|
diffArray.pop();
|
||||||
|
diffArray.push(last);
|
||||||
|
diffArray.push(secondLast);
|
||||||
|
|
||||||
|
last = secondLast;
|
||||||
|
secondLast = diffArray[diffArray.length - 2];
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!last.removed) {
|
||||||
|
last = {
|
||||||
|
added: false,
|
||||||
|
removed: true,
|
||||||
|
value: "",
|
||||||
|
};
|
||||||
|
diffArray.push(last);
|
||||||
|
}
|
||||||
|
|
||||||
|
last.value = last.value + tail;
|
||||||
|
}
|
||||||
|
|
||||||
|
return diffArray;
|
||||||
|
}
|
||||||
|
|
||||||
async #streamNextChar() {
|
async #streamNextChar() {
|
||||||
if (this.currentWordIndex < this.words.length) {
|
if (this.currentWordIndex < this.words.length) {
|
||||||
const currentToken = this.words[this.currentWordIndex];
|
const currentToken = this.words[this.currentWordIndex];
|
||||||
|
@ -134,7 +183,7 @@ export default class DiffStreamer {
|
||||||
this.currentWordIndex++;
|
this.currentWordIndex++;
|
||||||
this.currentCharIndex = 0;
|
this.currentCharIndex = 0;
|
||||||
|
|
||||||
const originalDiff = this.jsDiff.diffWordsWithSpace(
|
const originalDiff = this.streamingDiff(
|
||||||
this.selectedText,
|
this.selectedText,
|
||||||
this.suggestion
|
this.suggestion
|
||||||
);
|
);
|
||||||
|
|
|
@ -0,0 +1,67 @@
|
||||||
|
import { module, test } from "qunit";
|
||||||
|
import DiffStreamer from "discourse/plugins/discourse-ai/discourse/lib/diff-streamer";
|
||||||
|
|
||||||
|
module("Unit | Lib | diff-streamer", function () {
|
||||||
|
test("streamingDiff correctly handles trivial cases", async function (assert) {
|
||||||
|
const originalText = "helo world";
|
||||||
|
const targetText = "hello world";
|
||||||
|
|
||||||
|
const diffStreamer = new DiffStreamer(this.originalTextContent);
|
||||||
|
await diffStreamer.loadJSDiff();
|
||||||
|
|
||||||
|
const diffResult = diffStreamer.streamingDiff(originalText, targetText);
|
||||||
|
|
||||||
|
const expectedDiff = [
|
||||||
|
{ count: 1, added: false, removed: true, value: "helo" },
|
||||||
|
{ count: 1, added: true, removed: false, value: "hello" },
|
||||||
|
{ count: 2, added: false, removed: false, value: " world" },
|
||||||
|
];
|
||||||
|
|
||||||
|
assert.deepEqual(
|
||||||
|
diffResult,
|
||||||
|
expectedDiff,
|
||||||
|
"Diff result should match the expected structure"
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("streamingDiff correctly consolidates and handles diff drift", async function (assert) {
|
||||||
|
const originalText =
|
||||||
|
"This is todone, but I want to can why.\n\nWe\n\nSEO Tags supports a `canonical_url` override. I tried a few possibilities there. The one I wanted to work, ex: `https://www.discourse.org/de`, appended an extra `/de` on the URL, only in the deploy b";
|
||||||
|
const targetText = "This is to-done";
|
||||||
|
|
||||||
|
const diffStreamer = new DiffStreamer(this.originalTextContent);
|
||||||
|
await diffStreamer.loadJSDiff();
|
||||||
|
|
||||||
|
const diffResult = diffStreamer.streamingDiff(originalText, targetText);
|
||||||
|
|
||||||
|
// Verify the diff result is an array with the expected structure
|
||||||
|
assert.true(Array.isArray(diffResult), "Diff result should be an array");
|
||||||
|
assert.strictEqual(
|
||||||
|
diffResult.length,
|
||||||
|
3,
|
||||||
|
"Expecting exactly three parts in the diff result"
|
||||||
|
);
|
||||||
|
|
||||||
|
assert.strictEqual(
|
||||||
|
diffResult[0].value,
|
||||||
|
"This is ",
|
||||||
|
"First part should be unchanged"
|
||||||
|
);
|
||||||
|
|
||||||
|
assert.strictEqual(
|
||||||
|
diffResult[1].value,
|
||||||
|
"to-done",
|
||||||
|
"Second part should be an insertion"
|
||||||
|
);
|
||||||
|
|
||||||
|
assert.true(diffResult[1].added, "Second part should be an insertion");
|
||||||
|
|
||||||
|
assert.strictEqual(
|
||||||
|
diffResult[2].value.length,
|
||||||
|
235,
|
||||||
|
"Third part should include all text"
|
||||||
|
);
|
||||||
|
|
||||||
|
assert.true(diffResult[2].removed, "Third part should be a removal");
|
||||||
|
});
|
||||||
|
});
|
Loading…
Reference in New Issue