# discourse-ai/lib/translation/content_splitter.rb

# frozen_string_literal: true

module DiscourseAi
  module Translation
    # Splits long content into a list of chunks of roughly CHUNK_SIZE
    # characters each (lengths are measured with String#length).
    #
    # When a chunk has to be cut, the cut position is chosen by trying, in
    # order: the end of a top-level HTML node, the end of a BBCode/fenced block
    # that straddles the target size, a natural text boundary (paragraph,
    # sentence, clause, newline, whitespace), and finally a hard cut at the
    # target size. A candidate is accepted only if it does not exceed 1.5x the
    # target size.
    class ContentSplitter
      # Default target chunk length, in characters.
      CHUNK_SIZE = 3000

      # Block constructs that should be kept whole when possible. Each pattern
      # matches from the opening tag to the first matching closing tag,
      # across newlines.
      BBCODE_PATTERNS = [
        %r{\[table.*?\].*?\[/table\]}m,
        %r{\[quote.*?\].*?\[/quote\]}m,
        %r{\[details.*?\].*?\[/details\]}m,
        %r{\<details.*?\>.*?\</details\>}m,
        %r{\[spoiler.*?\].*?\[/spoiler\]}m,
        %r{\[code.*?\].*?\[/code\]}m,
        /```.*?```/m,
      ].freeze

      # Text boundaries, ordered from most to least preferred.
      TEXT_BOUNDARIES = [
        /\n\s*\n\s*|\r\n\s*\r\n\s*/, # double newlines with optional spaces
        /[.!?]\s+/, # sentence endings
        /[,;]\s+/, # clause endings
        /\n|\r\n/, # single newlines
        /\s+/, # any whitespace
      ].freeze

      # Splits +content+ into chunks.
      #
      # @param content [String, nil] the text to split
      # @param chunk_size [Integer, nil] target chunk length in characters;
      #   nil falls back to CHUNK_SIZE
      # @return [Array<String>] [] for nil content, [""] for empty content,
      #   [content] when it already fits, otherwise the consecutive chunks
      # @raise [ArgumentError] if chunk_size is not a positive Integer
      def self.split(content, chunk_size: CHUNK_SIZE)
        return [] if content.nil?
        return [""] if content.empty?

        chunk_size ||= CHUNK_SIZE
        unless chunk_size.is_a?(Integer) && chunk_size.positive?
          raise ArgumentError, "chunk_size must be a positive Integer"
        end

        return [content] if content.length <= chunk_size

        chunks = []
        remaining = content.dup

        # `present?` is presumably ActiveSupport's String#present?, so a
        # remainder consisting only of whitespace ends the loop and is not
        # emitted as a chunk.
        while remaining.present?
          chunk = extract_mixed_chunk(remaining, size: chunk_size)
          # Defensive: an empty chunk would never shrink `remaining`.
          break if chunk.empty?
          chunks << chunk
          remaining = remaining[chunk.length..-1]
        end

        chunks
      end

      # NOTE: a bare `private` does not change the visibility of methods
      # defined with `def self.`, so the helpers below remain callable from
      # outside the class. They are left that way to avoid breaking callers.
      private

      # Returns the leading portion of +text+ that forms the next chunk.
      #
      # @param text [String] the not-yet-split remainder
      # @param size [Integer] target chunk length in characters
      # @return [String] +text+ itself when it fits in +size+, otherwise a
      #   non-empty prefix of it
      def self.extract_mixed_chunk(text, size: CHUNK_SIZE)
        return text if text.length <= size
        flexible_size = size * 1.5

        # Try each splitting strategy in order, evaluating lazily so later
        # (cheaper but cruder) strategies only run when earlier ones yield
        # nothing usable. A split point of 0 is rejected because it would
        # produce an empty chunk and make the caller stop early, dropping the
        # rest of the content. The final strategy always yields `size`.
        split_point =
          [
            -> { find_nearest_html_end_index(text, size) },
            -> { find_nearest_bbcode_end_index(text, size) },
            -> { find_text_boundary(text, size) },
            -> { size },
          ].lazy.filter_map(&:call).find { |pos| pos.positive? && pos <= flexible_size }

        text[0...split_point]
      end

      # Walks the top-level nodes of +text+ parsed as an HTML5 fragment and
      # returns the cumulative end offset of the first node that ends past
      # +target_pos+.
      #
      # NOTE(review): offsets are summed from each node's re-serialized HTML
      # (`node.to_html`), not from the original text. If the serializer's
      # output differs in length from the source markup, the returned index
      # will not line up with a node boundary in +text+ -- confirm.
      #
      # @return [Integer, nil] nil when +text+ contains no "<", when no node
      #   ends past +target_pos+, or when parsing raises Nokogiri::SyntaxError
      def self.find_nearest_html_end_index(text, target_pos)
        return nil unless text.include?("<")

        current_length = 0
        Nokogiri::HTML5.fragment(text).children.each do |node|
          end_pos = current_length + node.to_html.length
          return end_pos if end_pos > target_pos
          current_length = end_pos
        end

        nil
      rescue Nokogiri::SyntaxError
        nil
      end

      # Returns the end offset of the first BBCODE_PATTERNS match (patterns
      # tried in declaration order) that starts at or before +target_pos+ and
      # ends after it, i.e. a block that a cut at +target_pos+ would break.
      #
      # @return [Integer, nil] nil when no block straddles +target_pos+
      def self.find_nearest_bbcode_end_index(text, target_pos)
        BBCODE_PATTERNS.each do |pattern|
          text.scan(pattern) do
            match = Regexp.last_match
            return match.end(0) if match.begin(0) <= target_pos && match.end(0) > target_pos
          end
        end

        nil
      end

      # Finds the best text boundary whose match starts at or before
      # +target_pos+, trying TEXT_BOUNDARIES from most to least preferred.
      #
      # The returned index is the position just AFTER the boundary and any
      # whitespace following it, so sentence/clause punctuation and trailing
      # whitespace stay at the end of the current chunk rather than starting
      # the next one.
      #
      # @return [Integer, nil] nil when no boundary pattern matches
      def self.find_text_boundary(text, target_pos)
        TEXT_BOUNDARIES.each do |pattern|
          next unless text.rindex(pattern, target_pos)

          # rindex returns where the match starts; cut where it ends instead.
          pos = Regexp.last_match.end(0)
          # Include all trailing whitespace
          pos += 1 while pos < text.length && text[pos].match?(/\s/)
          return pos
        end

        nil
      end
    end
  end
end