FIX: Make summaries backfill job more resilient. (#1071)
To quickly select backfill candidates without comparing SHAs, we compare the last summarized post to the topic's highest_post_number. However, hiding or deleting a post and adding a small action will update this column, causing the job to stall and re-generate the same summary repeatedly until someone posts a regular reply. On top of this, this is not always true for topics with `best_replies`, as this last reply isn't necessarily included. Since this is not evident at first glance and each summarization strategy picks its targets differently, I'm opting to simplify the backfill logic and how we track potential candidates. The first step is dropping `content_range`, which serves no purpose and it's there because summary caching was supposed to work differently at the beginning. So instead, I'm replacing it with a column called `highest_target_number`, which tracks `highest_post_number` for topics and could track other things like channel's `message_count` in the future. Now that we have this column when selecting every potential backfill candidate, we'll check if the summary is truly outdated by comparing the SHAs, and if it's not, we just update the column and move on
This commit is contained in:
parent
f9aa2de413
commit
46fcdb6ba5
|
@ -16,7 +16,7 @@ module ::Jobs
|
|||
gist = summarizer.existing_summary
|
||||
return if gist.present? && (!gist.outdated || gist.created_at >= 5.minutes.ago)
|
||||
|
||||
summarizer.force_summarize(Discourse.system_user)
|
||||
summarizer.summarize(Discourse.system_user)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -18,7 +18,8 @@ module ::Jobs
|
|||
backfill_candidates(gist_t)
|
||||
.limit(current_budget(gist_t))
|
||||
.each do |topic|
|
||||
DiscourseAi::Summarization.topic_gist(topic).force_summarize(system_user)
|
||||
strategy = DiscourseAi::Summarization.topic_gist(topic)
|
||||
try_summarize(strategy, system_user, topic)
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -26,10 +27,25 @@ module ::Jobs
|
|||
backfill_candidates(complete_t)
|
||||
.limit(current_budget(complete_t))
|
||||
.each do |topic|
|
||||
DiscourseAi::Summarization.topic_summary(topic).force_summarize(system_user)
|
||||
strategy = DiscourseAi::Summarization.topic_summary(topic)
|
||||
try_summarize(strategy, system_user, topic)
|
||||
end
|
||||
end
|
||||
|
||||
def try_summarize(strategy, user, topic)
|
||||
existing_summary = strategy.existing_summary
|
||||
|
||||
if existing_summary.blank? || existing_summary.outdated
|
||||
strategy.summarize(user)
|
||||
else
|
||||
# Hiding or deleting a post, and creating a small action alters the Topic#highest_post_number.
|
||||
# We use this as a quick way to select potential backfill candidates without relying on original_content_sha.
|
||||
# At this point, we are confident the summary doesn't need to be regenerated so something other than a regular reply
|
||||
# caused the number to change in the topic.
|
||||
existing_summary.update!(highest_target_number: topic.highest_post_number)
|
||||
end
|
||||
end
|
||||
|
||||
def backfill_candidates(summary_type)
|
||||
max_age_days = SiteSetting.ai_summary_backfill_topic_max_age_days
|
||||
|
||||
|
@ -45,12 +61,12 @@ module ::Jobs
|
|||
.where(
|
||||
<<~SQL, # (1..1) gets stored ad (1..2).
|
||||
ais.id IS NULL OR (
|
||||
UPPER(ais.content_range) < topics.highest_post_number + 1
|
||||
AND ais.created_at < (current_timestamp - INTERVAL '5 minutes')
|
||||
ais.highest_target_number < topics.highest_post_number
|
||||
AND ais.updated_at < (current_timestamp - INTERVAL '5 minutes')
|
||||
)
|
||||
SQL
|
||||
)
|
||||
.order("ais.created_at DESC NULLS FIRST, topics.last_posted_at DESC")
|
||||
.order("ais.updated_at DESC NULLS FIRST, topics.last_posted_at DESC")
|
||||
end
|
||||
|
||||
def current_budget(type)
|
||||
|
|
|
@ -1,6 +1,9 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
class AiSummary < ActiveRecord::Base
|
||||
# TODO remove this line 01-3-2025
|
||||
self.ignored_columns = %i[content_range]
|
||||
|
||||
belongs_to :target, polymorphic: true
|
||||
|
||||
enum :summary_type, { complete: 0, gist: 1 }
|
||||
|
@ -15,14 +18,20 @@ class AiSummary < ActiveRecord::Base
|
|||
target_id: strategy.target.id,
|
||||
target_type: strategy.target.class.name,
|
||||
algorithm: llm_model.name,
|
||||
content_range: (content_ids.first..content_ids.last),
|
||||
highest_target_number: strategy.highest_target_number,
|
||||
summarized_text: summary,
|
||||
original_content_sha: build_sha(content_ids.join),
|
||||
summary_type: strategy.type,
|
||||
origin: !!human ? origins[:human] : origins[:system],
|
||||
},
|
||||
unique_by: %i[target_id target_type summary_type],
|
||||
update_only: %i[summarized_text original_content_sha algorithm origin content_range],
|
||||
update_only: %i[
|
||||
summarized_text
|
||||
original_content_sha
|
||||
algorithm
|
||||
origin
|
||||
highest_target_number
|
||||
],
|
||||
)
|
||||
.first
|
||||
.then { AiSummary.find_by(id: _1["id"]) }
|
||||
|
@ -45,17 +54,17 @@ end
|
|||
#
|
||||
# Table name: ai_summaries
|
||||
#
|
||||
# id :bigint not null, primary key
|
||||
# target_id :integer not null
|
||||
# target_type :string not null
|
||||
# content_range :int4range
|
||||
# summarized_text :string not null
|
||||
# original_content_sha :string not null
|
||||
# algorithm :string not null
|
||||
# created_at :datetime not null
|
||||
# updated_at :datetime not null
|
||||
# summary_type :integer default("complete"), not null
|
||||
# origin :integer
|
||||
# id :bigint not null, primary key
|
||||
# target_id :integer not null
|
||||
# target_type :string not null
|
||||
# summarized_text :string not null
|
||||
# original_content_sha :string not null
|
||||
# algorithm :string not null
|
||||
# created_at :datetime not null
|
||||
# updated_at :datetime not null
|
||||
# summary_type :integer default("complete"), not null
|
||||
# origin :integer
|
||||
# highest_target_number :integer not null
|
||||
#
|
||||
# Indexes
|
||||
#
|
||||
|
|
|
@ -13,15 +13,6 @@ class AiTopicSummarySerializer < ApplicationSerializer
|
|||
end
|
||||
|
||||
def new_posts_since_summary
|
||||
# Postgres uses discrete range types for int4range, which means
|
||||
# (1..2) is stored as (1...3).
|
||||
#
|
||||
# We use Range#max to work around this, which in the case above always returns 2.
|
||||
# Be careful with using Range#end here, it could lead to unexpected results as:
|
||||
#
|
||||
# (1..2).end => 2
|
||||
# (1...3).end => 3
|
||||
|
||||
object.target.highest_post_number.to_i - object.content_range&.max.to_i
|
||||
object.target.highest_post_number.to_i - object.highest_target_number.to_i
|
||||
end
|
||||
end
|
||||
|
|
|
@ -0,0 +1,14 @@
|
|||
# frozen_string_literal: true
|
||||
class AddHighestTargetNumberToAiSummary < ActiveRecord::Migration[7.2]
|
||||
def up
|
||||
add_column :ai_summaries, :highest_target_number, :integer, null: false
|
||||
|
||||
execute <<~SQL
|
||||
UPDATE ai_summaries SET highest_target_number = GREATEST(UPPER(content_range) - 1, 1)
|
||||
SQL
|
||||
end
|
||||
|
||||
def down
|
||||
drop_column :ai_summaries, :highest_target_number
|
||||
end
|
||||
end
|
|
@ -0,0 +1,12 @@
|
|||
# frozen_string_literal: true
|
||||
class DropAiSummariesContentRange < ActiveRecord::Migration[7.2]
|
||||
DROPPED_COLUMNS ||= { ai_summaries: %i[content_range] }
|
||||
|
||||
def up
|
||||
DROPPED_COLUMNS.each { |table, columns| Migration::ColumnDropper.execute_drop(table, columns) }
|
||||
end
|
||||
|
||||
def down
|
||||
raise ActiveRecord::IrreversibleMigration
|
||||
end
|
||||
end
|
|
@ -21,6 +21,8 @@ module DiscourseAi
|
|||
# @param &on_partial_blk { Block - Optional } - The passed block will get called with the LLM partial response alongside a cancel function.
|
||||
# Note: The block is only called with results of the final summary, not intermediate summaries.
|
||||
#
|
||||
# This method doesn't care if we already have an up to date summary. It always regenerate.
|
||||
#
|
||||
# @returns { AiSummary } - Resulting summary.
|
||||
def summarize(user, &on_partial_blk)
|
||||
base_summary = ""
|
||||
|
@ -68,11 +70,6 @@ module DiscourseAi
|
|||
AiSummary.where(target: strategy.target, summary_type: strategy.type).destroy_all
|
||||
end
|
||||
|
||||
def force_summarize(user, &on_partial_blk)
|
||||
@persist_summaries = true
|
||||
summarize(user, &on_partial_blk)
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
attr_reader :persist_summaries
|
||||
|
|
|
@ -8,6 +8,10 @@ module DiscourseAi
|
|||
AiSummary.summary_types[:complete]
|
||||
end
|
||||
|
||||
def highest_target_number
|
||||
nil # We don't persist so we can return nil.
|
||||
end
|
||||
|
||||
def initialize(target, since)
|
||||
super(target)
|
||||
@since = since
|
||||
|
|
|
@ -12,6 +12,10 @@ module DiscourseAi
|
|||
"gists"
|
||||
end
|
||||
|
||||
def highest_target_number
|
||||
target.highest_post_number
|
||||
end
|
||||
|
||||
def targets_data
|
||||
op_post_number = 1
|
||||
|
||||
|
@ -38,6 +42,7 @@ module DiscourseAi
|
|||
.limit(20)
|
||||
.pluck(:post_number)
|
||||
end
|
||||
|
||||
posts_data =
|
||||
Post
|
||||
.where(topic_id: target.id)
|
||||
|
|
|
@ -8,6 +8,10 @@ module DiscourseAi
|
|||
AiSummary.summary_types[:complete]
|
||||
end
|
||||
|
||||
def highest_target_number
|
||||
target.highest_post_number
|
||||
end
|
||||
|
||||
def targets_data
|
||||
posts_data =
|
||||
(target.has_summary? ? best_replies : pick_selection).pluck(
|
||||
|
|
|
@ -7,6 +7,7 @@ Fabricator(:ai_summary) do
|
|||
target { Fabricate(:topic) }
|
||||
summary_type AiSummary.summary_types[:complete]
|
||||
origin AiSummary.origins[:human]
|
||||
highest_target_number 1
|
||||
end
|
||||
|
||||
Fabricator(:topic_ai_gist, from: :ai_summary) do
|
||||
|
|
|
@ -44,13 +44,13 @@ RSpec.describe Jobs::SummariesBackfill do
|
|||
end
|
||||
|
||||
it "ignores up to date summaries" do
|
||||
Fabricate(:ai_summary, target: topic, content_range: (1..2))
|
||||
Fabricate(:ai_summary, target: topic, highest_target_number: 2, updated_at: 10.minutes.ago)
|
||||
|
||||
expect(subject.backfill_candidates(type)).to be_empty
|
||||
end
|
||||
|
||||
it "ignores outdated summaries created less than five minutes ago" do
|
||||
Fabricate(:ai_summary, target: topic, content_range: (1..1), created_at: 4.minutes.ago)
|
||||
it "ignores outdated summaries updated less than five minutes ago" do
|
||||
Fabricate(:ai_summary, target: topic, highest_target_number: 1, updated_at: 4.minutes.ago)
|
||||
|
||||
expect(subject.backfill_candidates(type)).to be_empty
|
||||
end
|
||||
|
@ -66,7 +66,7 @@ RSpec.describe Jobs::SummariesBackfill do
|
|||
topic_2 =
|
||||
Fabricate(:topic, word_count: 200, last_posted_at: 2.minutes.ago, highest_post_number: 1)
|
||||
topic.update!(last_posted_at: 1.minute.ago)
|
||||
Fabricate(:ai_summary, target: topic, created_at: 1.hour.ago, content_range: (1..1))
|
||||
Fabricate(:ai_summary, target: topic, updated_at: 1.hour.ago, highest_target_number: 1)
|
||||
|
||||
expect(subject.backfill_candidates(type).map(&:id)).to contain_exactly(topic_2.id, topic.id)
|
||||
end
|
||||
|
@ -84,13 +84,13 @@ RSpec.describe Jobs::SummariesBackfill do
|
|||
topic_2 =
|
||||
Fabricate(:topic, word_count: 200, last_posted_at: 2.minutes.ago, highest_post_number: 1)
|
||||
topic.update!(last_posted_at: 1.minute.ago)
|
||||
Fabricate(:ai_summary, target: topic, created_at: 3.hours.ago, content_range: (1..1))
|
||||
Fabricate(:topic_ai_gist, target: topic, created_at: 3.hours.ago, content_range: (1..1))
|
||||
Fabricate(:ai_summary, target: topic, updated_at: 3.hours.ago, highest_target_number: 1)
|
||||
Fabricate(:topic_ai_gist, target: topic, updated_at: 3.hours.ago, highest_target_number: 1)
|
||||
|
||||
summary_1 = "Summary of topic_2"
|
||||
gist_1 = "Gist of topic_2"
|
||||
summary_2 = "Summary of topic"
|
||||
gist_2 = "Gist of topic"
|
||||
summary_2 = "Updated summary of topic"
|
||||
gist_2 = "Updated gist of topic"
|
||||
|
||||
DiscourseAi::Completions::Llm.with_prepared_responses(
|
||||
[gist_1, gist_2, summary_1, summary_2],
|
||||
|
@ -100,6 +100,32 @@ RSpec.describe Jobs::SummariesBackfill do
|
|||
expect(AiSummary.gist.find_by(target: topic_2).summarized_text).to eq(gist_1)
|
||||
expect(AiSummary.complete.find_by(target: topic).summarized_text).to eq(summary_2)
|
||||
expect(AiSummary.gist.find_by(target: topic).summarized_text).to eq(gist_2)
|
||||
|
||||
# Queue has to be empty if we just generated all summaries
|
||||
expect(subject.backfill_candidates(AiSummary.summary_types[:complete])).to be_empty
|
||||
expect(subject.backfill_candidates(AiSummary.summary_types[:gist])).to be_empty
|
||||
|
||||
# Queue still empty when they are up to date and time passes.
|
||||
AiSummary.update_all(updated_at: 20.minutes.ago)
|
||||
expect(subject.backfill_candidates(AiSummary.summary_types[:complete])).to be_empty
|
||||
expect(subject.backfill_candidates(AiSummary.summary_types[:gist])).to be_empty
|
||||
end
|
||||
|
||||
it "updates the highest_target_number if the summary turned to be up to date" do
|
||||
existing_summary =
|
||||
Fabricate(
|
||||
:ai_summary,
|
||||
target: topic,
|
||||
updated_at: 3.hours.ago,
|
||||
highest_target_number: topic.highest_post_number,
|
||||
)
|
||||
og_highest_post_number = topic.highest_post_number
|
||||
topic.update!(highest_post_number: og_highest_post_number + 1)
|
||||
|
||||
# No prepared responses here. We don't perform a completion call.
|
||||
subject.execute({})
|
||||
|
||||
expect(existing_summary.reload.highest_target_number).to eq(og_highest_post_number + 1)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -26,13 +26,7 @@ describe DiscourseAi::GuardianExtensions do
|
|||
end
|
||||
|
||||
it "returns true if there is a cached summary" do
|
||||
AiSummary.create!(
|
||||
target: topic,
|
||||
summarized_text: "test",
|
||||
original_content_sha: "123",
|
||||
algorithm: "test",
|
||||
summary_type: AiSummary.summary_types[:complete],
|
||||
)
|
||||
Fabricate(:ai_summary, target: topic)
|
||||
|
||||
expect(guardian.can_see_summary?(topic)).to eq(true)
|
||||
end
|
||||
|
@ -66,13 +60,7 @@ describe DiscourseAi::GuardianExtensions do
|
|||
end
|
||||
|
||||
it "returns true for anons when there is a cached summary" do
|
||||
AiSummary.create!(
|
||||
target: topic,
|
||||
summarized_text: "test",
|
||||
original_content_sha: "123",
|
||||
algorithm: "test",
|
||||
summary_type: AiSummary.summary_types[:complete],
|
||||
)
|
||||
Fabricate(:ai_summary, target: topic)
|
||||
|
||||
expect(guardian.can_see_summary?(topic)).to eq(true)
|
||||
end
|
||||
|
|
|
@ -13,15 +13,7 @@ RSpec.describe DiscourseAi::Summarization::SummaryController do
|
|||
|
||||
context "when streaming" do
|
||||
it "return a cached summary with json payload and does not trigger job if it exists" do
|
||||
section =
|
||||
AiSummary.create!(
|
||||
target: topic,
|
||||
summarized_text: "test",
|
||||
algorithm: "test",
|
||||
original_content_sha: "test",
|
||||
summary_type: AiSummary.summary_types[:complete],
|
||||
)
|
||||
|
||||
summary = Fabricate(:ai_summary, target: topic)
|
||||
sign_in(Fabricate(:admin))
|
||||
|
||||
get "/discourse-ai/summarization/t/#{topic.id}.json?stream=true"
|
||||
|
@ -29,8 +21,10 @@ RSpec.describe DiscourseAi::Summarization::SummaryController do
|
|||
expect(response.status).to eq(200)
|
||||
expect(Jobs::StreamTopicAiSummary.jobs.size).to eq(0)
|
||||
|
||||
summary = response.parsed_body
|
||||
expect(summary.dig("ai_topic_summary", "summarized_text")).to eq(section.summarized_text)
|
||||
response_summary = response.parsed_body
|
||||
expect(response_summary.dig("ai_topic_summary", "summarized_text")).to eq(
|
||||
summary.summarized_text,
|
||||
)
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -42,21 +36,15 @@ RSpec.describe DiscourseAi::Summarization::SummaryController do
|
|||
end
|
||||
|
||||
it "returns a cached summary" do
|
||||
section =
|
||||
AiSummary.create!(
|
||||
target: topic,
|
||||
summarized_text: "test",
|
||||
algorithm: "test",
|
||||
original_content_sha: "test",
|
||||
summary_type: AiSummary.summary_types[:complete],
|
||||
)
|
||||
|
||||
summary = Fabricate(:ai_summary, target: topic)
|
||||
get "/discourse-ai/summarization/t/#{topic.id}.json"
|
||||
|
||||
expect(response.status).to eq(200)
|
||||
|
||||
summary = response.parsed_body
|
||||
expect(summary.dig("ai_topic_summary", "summarized_text")).to eq(section.summarized_text)
|
||||
response_summary = response.parsed_body
|
||||
expect(response_summary.dig("ai_topic_summary", "summarized_text")).to eq(
|
||||
summary.summarized_text,
|
||||
)
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -90,15 +78,15 @@ RSpec.describe DiscourseAi::Summarization::SummaryController do
|
|||
get "/discourse-ai/summarization/t/#{topic.id}.json"
|
||||
|
||||
expect(response.status).to eq(200)
|
||||
summary = response.parsed_body["ai_topic_summary"]
|
||||
section = AiSummary.last
|
||||
response_summary = response.parsed_body["ai_topic_summary"]
|
||||
summary = AiSummary.last
|
||||
|
||||
expect(section.summarized_text).to eq(summary_text)
|
||||
expect(summary["summarized_text"]).to eq(section.summarized_text)
|
||||
expect(summary["algorithm"]).to eq("fake")
|
||||
expect(summary["outdated"]).to eq(false)
|
||||
expect(summary["can_regenerate"]).to eq(true)
|
||||
expect(summary["new_posts_since_summary"]).to be_zero
|
||||
expect(summary.summarized_text).to eq(summary_text)
|
||||
expect(response_summary["summarized_text"]).to eq(summary.summarized_text)
|
||||
expect(response_summary["algorithm"]).to eq("fake")
|
||||
expect(response_summary["outdated"]).to eq(false)
|
||||
expect(response_summary["can_regenerate"]).to eq(true)
|
||||
expect(response_summary["new_posts_since_summary"]).to be_zero
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -129,21 +117,16 @@ RSpec.describe DiscourseAi::Summarization::SummaryController do
|
|||
end
|
||||
|
||||
it "returns a cached summary" do
|
||||
section =
|
||||
AiSummary.create!(
|
||||
target: topic,
|
||||
summarized_text: "test",
|
||||
algorithm: "test",
|
||||
original_content_sha: "test",
|
||||
summary_type: AiSummary.summary_types[:complete],
|
||||
)
|
||||
summary = Fabricate(:ai_summary, target: topic)
|
||||
|
||||
get "/discourse-ai/summarization/t/#{topic.id}.json"
|
||||
|
||||
expect(response.status).to eq(200)
|
||||
|
||||
summary = response.parsed_body
|
||||
expect(summary.dig("ai_topic_summary", "summarized_text")).to eq(section.summarized_text)
|
||||
response_summary = response.parsed_body
|
||||
expect(response_summary.dig("ai_topic_summary", "summarized_text")).to eq(
|
||||
summary.summarized_text,
|
||||
)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -20,7 +20,7 @@ describe DiscourseAi::TopicSummarization do
|
|||
cached_summary =
|
||||
AiSummary.find_by(target: topic, summary_type: AiSummary.summary_types[:complete])
|
||||
|
||||
expect(cached_summary.content_range).to cover(*topic.posts.map(&:post_number))
|
||||
expect(cached_summary.highest_target_number).to eq(topic.highest_post_number)
|
||||
expect(cached_summary.summarized_text).to eq(summary)
|
||||
expect(cached_summary.original_content_sha).to be_present
|
||||
expect(cached_summary.algorithm).to eq("fake")
|
||||
|
|
|
@ -16,6 +16,8 @@ RSpec.describe "Summarize a topic ", type: :system do
|
|||
let(:topic_page) { PageObjects::Pages::Topic.new }
|
||||
let(:summary_box) { PageObjects::Components::AiSummaryTrigger.new }
|
||||
|
||||
fab!(:ai_summary) { Fabricate(:ai_summary, target: topic, summarized_text: "This is a summary") }
|
||||
|
||||
before do
|
||||
group.add(current_user)
|
||||
|
||||
|
@ -27,16 +29,6 @@ RSpec.describe "Summarize a topic ", type: :system do
|
|||
end
|
||||
|
||||
context "when a summary is cached" do
|
||||
before do
|
||||
AiSummary.create!(
|
||||
target: topic,
|
||||
summarized_text: summarization_result,
|
||||
algorithm: "test",
|
||||
original_content_sha: "test",
|
||||
summary_type: AiSummary.summary_types[:complete],
|
||||
)
|
||||
end
|
||||
|
||||
it "displays it" do
|
||||
topic_page.visit_topic(topic)
|
||||
summary_box.click_summarize
|
||||
|
@ -45,15 +37,6 @@ RSpec.describe "Summarize a topic ", type: :system do
|
|||
end
|
||||
|
||||
context "when a summary is outdated" do
|
||||
before do
|
||||
AiSummary.create!(
|
||||
target: topic,
|
||||
summarized_text: summarization_result,
|
||||
algorithm: "test",
|
||||
original_content_sha: "test",
|
||||
summary_type: AiSummary.summary_types[:complete],
|
||||
)
|
||||
end
|
||||
fab!(:new_post) do
|
||||
Fabricate(
|
||||
:post,
|
||||
|
|
Loading…
Reference in New Issue