discourse-ai/lib/translation/content_splitter.rb

110 lines
2.9 KiB
Ruby

# frozen_string_literal: true
module DiscourseAi
module Translation
class ContentSplitter
CHUNK_SIZE = 3000
BBCODE_PATTERNS = [
%r{\[table.*?\].*?\[/table\]}m,
%r{\[quote.*?\].*?\[/quote\]}m,
%r{\[details.*?\].*?\[/details\]}m,
%r{\<details.*?\>.*?\</details\>}m,
%r{\[spoiler.*?\].*?\[/spoiler\]}m,
%r{\[code.*?\].*?\[/code\]}m,
/```.*?```/m,
].freeze
TEXT_BOUNDARIES = [
/\n\s*\n\s*|\r\n\s*\r\n\s*/, # double newlines with optional spaces
/[.!?]\s+/, # sentence endings
/[,;]\s+/, # clause endings
/\n|\r\n/, # single newlines
/\s+/, # any whitespace
].freeze
def self.split(content)
return [] if content.nil?
return [""] if content.empty?
return [content] if content.length <= CHUNK_SIZE
chunks = []
remaining = content.dup
while remaining.present?
chunk = extract_mixed_chunk(remaining)
break if chunk.empty?
chunks << chunk
remaining = remaining[chunk.length..-1]
end
chunks
end
private
def self.extract_mixed_chunk(text, size: CHUNK_SIZE)
return text if text.length <= size
flexible_size = size * 1.5
# try each splitting strategy in order
split_point =
[
-> { find_nearest_html_end_index(text, size) },
-> { find_nearest_bbcode_end_index(text, size) },
-> { find_text_boundary(text, size) },
-> { size },
].lazy.map(&:call).compact.find { |pos| pos <= flexible_size }
text[0...split_point]
end
def self.find_nearest_html_end_index(text, target_pos)
return nil if !text.include?("<")
begin
doc = Nokogiri::HTML5.fragment(text)
current_length = 0
doc.children.each do |node|
html = node.to_html
end_pos = current_length + html.length
return end_pos if end_pos > target_pos
current_length = end_pos
end
nil
rescue Nokogiri::SyntaxError
nil
end
end
def self.find_nearest_bbcode_end_index(text, target_pos)
BBCODE_PATTERNS.each do |pattern|
text.scan(pattern) do |_|
match = $~
tag_start = match.begin(0)
tag_end = match.end(0)
return tag_end if tag_start <= target_pos && tag_end > target_pos
end
end
nil
end
def self.find_text_boundary(text, target_pos)
search_text = text
TEXT_BOUNDARIES.each do |pattern|
if pos = search_text.rindex(pattern, target_pos)
# Include all trailing whitespace
pos += 1 while pos < search_text.length && search_text[pos].match?(/\s/)
return pos
end
end
nil
end
end
end
end