discourse-ai/lib/utils/diff_utils/safety_checker.rb

92 lines
2.5 KiB
Ruby

# frozen_string_literal: true
require "cgi"
module DiscourseAi
  module Utils
    module DiffUtils
      # Heuristic gate answering "is this text safe to stream as-is?".
      #
      # The input is treated as HTML: complete tags are stripped and HTML
      # entities are unescaped, then a series of cheap textual checks look for
      # constructs that appear to be cut off half-way (an open link, an odd
      # number of backticks, an unterminated emoji shortcode, ...). The text is
      # considered safe only when none of the checks fire.
      #
      # All checks are deliberately conservative: they may report "unsafe" for
      # text that is actually complete (e.g. a lone ")" in prose).
      class SafetyChecker
        # A complete tag such as <span>, </del> or <a href="...">.
        HTML_TAG = %r{</?[^>]+>}
        # A whitespace-delimited word that is entirely an http(s) URL.
        WHOLE_WORD_URL = %r{\Ahttps?://[^\s]*\z}
        # Trailing punctuation after which a URL is assumed to be finished.
        URL_TERMINATOR = /[)\].,!?:;'"]\z/
        # "**" bold marker.
        BOLD_MARKER = /\*\*/
        # A "*" that is not immediately followed by another "*".
        ITALIC_STAR = /\*(?!\*)/
        # An underscore that is NOT sandwiched between two alphanumerics.
        # Intraword underscores (snake_case, file_name.rb) cannot open or close
        # emphasis, so they must not take part in the balance count.
        EMPHASIS_UNDERSCORE = /(?<![[:alnum:]])_|_(?![[:alnum:]])/
        # An image opener "![alt](" with no ")" before the end of its line.
        OPEN_IMAGE = /!\[.*?\]\([^)\n]*$/
        # A complete image "![alt](target)".
        COMPLETE_IMAGE = /!\[.*?\]\(.*?\)/
        # Any http(s) URL up to the next whitespace.
        INLINE_URL = %r{https?://[^\s]+}
        # BBCode-style quote delimiters, case-insensitive.
        QUOTE_OPEN = /\[quote(?:=.*?)?\]/i
        QUOTE_CLOSE = %r{\[/quote\]}i
        # ":name" optionally followed by the closing ":".
        EMOJI_TOKEN = /:[a-z0-9_+\-\.]+:?/i
        TRIPLE_BACKTICK = /```/

        # Checks are evaluated in this order; evaluation stops at the first
        # one that reports a problem.
        CHECKS = %i[
          unclosed_markdown_links?
          unclosed_raw_html_tag?
          trailing_incomplete_url?
          unclosed_backticks?
          unbalanced_bold_or_italic?
          incomplete_image_markdown?
          unbalanced_quote_blocks?
          unclosed_triple_backticks?
          partial_emoji?
        ].freeze

        private_constant :HTML_TAG,
                         :WHOLE_WORD_URL,
                         :URL_TERMINATOR,
                         :BOLD_MARKER,
                         :ITALIC_STAR,
                         :EMPHASIS_UNDERSCORE,
                         :OPEN_IMAGE,
                         :COMPLETE_IMAGE,
                         :INLINE_URL,
                         :QUOTE_OPEN,
                         :QUOTE_CLOSE,
                         :EMOJI_TOKEN,
                         :TRIPLE_BACKTICK,
                         :CHECKS

        # Convenience wrapper around +new(html_text).safe?+.
        #
        # @param html_text [String, nil] HTML fragment to inspect
        # @return [Boolean] true when no check reports an unfinished construct
        def self.safe_to_stream?(html_text)
          new(html_text).safe?
        end

        # @param html_text [String, nil] HTML fragment to inspect; nil is
        #   treated as an empty string
        def initialize(html_text)
          @original_html = html_text
          @text = sanitize(html_text)
        end

        # @return [Boolean] true when none of the heuristics in CHECKS fire
        def safe?
          CHECKS.none? { |check| send(check) }
        end

        private

        # Strips complete tags, then decodes HTML entities. Entities are
        # decoded after tag removal, so an escaped "&lt;" survives as a literal
        # "<" and is visible to the raw-HTML check.
        def sanitize(html)
          CGI.unescapeHTML(html.to_s.gsub(HTML_TAG, ""))
        end

        # True when square brackets or parentheses are not balanced by count
        # across the whole text.
        def unclosed_markdown_links?
          @text.count("[") != @text.count("]") || @text.count("(") != @text.count(")")
        end

        # True when the last "<" in the text has no ">" after it.
        def unclosed_raw_html_tag?
          last_lt = @text.rindex("<")
          return false unless last_lt

          last_gt = @text.rindex(">")
          last_gt.nil? || last_gt < last_lt
        end

        # True when the final whitespace-delimited word is an http(s) URL that
        # does not end in terminating punctuation, i.e. it may still grow.
        def trailing_incomplete_url?
          last_word = @text.split(/\s/).last
          return false unless last_word

          last_word.match?(WHOLE_WORD_URL) && !last_word.match?(URL_TERMINATOR)
        end

        # True when the total number of backticks is odd.
        def unclosed_backticks?
          @text.count("`").odd?
        end

        # True when "**" markers, lone "*" markers, or emphasis-capable
        # underscores occur an odd number of times.
        def unbalanced_bold_or_italic?
          @text.scan(BOLD_MARKER).size.odd? || @text.scan(ITALIC_STAR).size.odd? ||
            @text.scan(EMPHASIS_UNDERSCORE).size.odd?
        end

        # True when some image opener "![alt](" is not closed by ")" on the
        # same line. A complete image followed by more text on that line is
        # not considered incomplete.
        def incomplete_image_markdown?
          @text.match?(OPEN_IMAGE)
        end

        # True when there are more [quote] / [quote=...] openers than
        # [/quote] closers.
        def unbalanced_quote_blocks?
          @text.scan(QUOTE_OPEN).size > @text.scan(QUOTE_CLOSE).size
        end

        # True when "```" occurs an odd number of times.
        def unclosed_triple_backticks?
          @text.scan(TRIPLE_BACKTICK).size.odd?
        end

        # True when a ":name" token is missing its closing ":". Complete
        # images and URLs are removed first so that the colon in "http://"
        # is not mistaken for the start of a shortcode.
        #
        # NOTE: any colon directly followed by shortcode characters counts,
        # so the ":30" in "10:30" is also reported as unterminated.
        def partial_emoji?
          stripped = @text.gsub(COMPLETE_IMAGE, "").gsub(INLINE_URL, "")
          stripped.scan(EMOJI_TOKEN).any? { |token| !token.end_with?(":") }
        end
      end
    end
  end
end