discourse-ai/lib/utils/image_to_text.rb

# frozen_string_literal: true

class DiscourseAi::Utils::ImageToText
  BACKOFF_SECONDS = [5, 30, 60]
  MAX_IMAGE_SIZE = 10.megabytes
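
  # Minimal IO-like shim over a list of uploads: read(length) streams the text
  # extracted from each image in turn, so callers that expect a file handle
  # (e.g. RAG ingestion) can consume LLM OCR output without a temp file.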
  class Reader
    def initialize(uploads:, llm_model:, user:)
      @uploads = uploads
      @llm_model = llm_model
      @user = user
      @buffer = +""
      @to_process = uploads.dup
    end

    # return nil if no more data
    def read(length)
      # for implementation simplicity we will process one image at a time
      if !@buffer.empty?
        part = @buffer.slice!(0, length)
        return part
      end

      return nil if @to_process.empty?

      upload = @to_process.shift
      extractor =
        DiscourseAi::Utils::ImageToText.new(upload: upload, llm_model: @llm_model, user: @user)
      extractor.extract_text do |chunk, error|
        if error
          Discourse.warn_exception(
            error,
            message: "Discourse AI: Failed to extract text from image",
          )
        else
          # chunk markers mean Discourse RAG ingestion does not need overlapping chunks
          @buffer << "\n[[metadata ]]\n"
          @buffer << chunk
        end
      end

      read(length)
    end
  end
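
  # A minimal usage sketch for as_fake_file (defined below); illustrative only,
  # `uploads`, `llm_model`, `user`, and `destination` are assumed to come from
  # the caller:
  #
  #   reader = DiscourseAi::Utils::ImageToText.as_fake_file(
  #     uploads: uploads,
  #     llm_model: llm_model,
  #     user: user,
  #   )
  #   while (chunk = reader.read(4096))
  #     destination << chunk
  #   end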
  def self.as_fake_file(uploads:, llm_model:, user:)
    # our text extraction pipeline expects a file, so return a simple object that
    # can simulate read(size) and stream the extracted content
    Reader.new(uploads: uploads, llm_model: llm_model, user: user)
  end

  def self.tesseract_installed?
    # memoize the check (including a false result) so we only shell out to `which` once
    if defined?(@tesseract_installed)
      @tesseract_installed
    else
      @tesseract_installed =
        begin
          Discourse::Utils.execute_command("which", "tesseract")
          true
        rescue Discourse::Utils::CommandError
          false
        end
    end
  end

  attr_reader :upload, :llm_model, :user

  def initialize(upload:, llm_model:, user:, guidance_text: nil)
    @upload = upload
    @llm_model = llm_model
    @user = user
    @guidance_text = guidance_text
  end
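
  # Extracts text from `upload`, yielding each chunk to the required block.
  # On repeated failure the error is yielded instead, as `yield(nil, error)`.
  # Retries follow the BACKOFF_SECONDS schedule. Returns the array of chunks,
  # or [] when extraction failed.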
  def extract_text(retries: 3)
    raise "must specify a block" if !block_given?

    extracted = nil
    error = nil

    backoff = BACKOFF_SECONDS.dup
    seconds = nil

    retries.times do
      begin
        extracted = extract_text_from_page(upload)
        break
      rescue => e
        error = e
        # reuse the last backoff value once the schedule is exhausted
        seconds = backoff.shift || seconds
        sleep(seconds)
      end
    end

    if extracted
      extracted.each { |chunk| yield(chunk) }
    else
      yield(nil, error)
    end

    extracted || []
  end

  private

  def system_message
    <<~MSG
      OCR the following page into Markdown. Tables should be formatted as GitHub flavored Markdown.
      Do not surround your output with triple backticks.
      Chunk the document into sections of roughly 250 - 1000 words. Our goal is to identify parts of the page with the same semantic theme. These chunks will be embedded and used in a RAG pipeline.
      Always prefer returning text in Markdown vs HTML.
      Describe all the images and graphs you encounter.
      Only return text that will assist in the querying of data. Omit text such as "I had trouble recognizing images" and so on.
      Surround the chunks with <chunk> </chunk> HTML tags.
    MSG
  end
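
  # OCR a single upload: optionally seed the prompt with Tesseract output
  # (when the binary is available) or with the supplied guidance_text, then
  # ask the LLM to return <chunk>-delimited Markdown.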
  def extract_text_from_page(page)
    raw_text = @guidance_text
    raw_text ||= extract_text_with_tesseract(page) if self.class.tesseract_installed?

    llm = llm_model.to_llm
    if raw_text.present?
      messages = [
        {
          type: :user,
          content: [
            "The following text was extracted from an image using OCR. Please enhance, correct, and structure this content while maintaining the original text:\n\n#{raw_text}",
            { upload_id: page.id },
          ],
        },
      ]
    else
      messages = [
        { type: :user, content: ["Please OCR the content in the image.", { upload_id: page.id }] },
      ]
    end

    prompt = DiscourseAi::Completions::Prompt.new(system_message, messages: messages)
    result = llm.generate(prompt, user: Discourse.system_user)
    extract_chunks(result)
  end
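
  # Best-effort local OCR, roughly equivalent to running
  # `tesseract /path/to/image /tmp/outbase` (Tesseract appends ".txt" to the
  # output base). Returns nil when the binary is missing and "" on any other
  # failure, so the LLM-only path still works.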
  def extract_text_with_tesseract(page)
    # return nil if we cannot find the tesseract binary
    return nil if !self.class.tesseract_installed?

    upload_path =
      if page.local?
        Discourse.store.path_for(page)
      else
        Discourse.store.download_safe(page, max_file_size_kb: MAX_IMAGE_SIZE)&.path
      end
    return "" if !upload_path || !File.exist?(upload_path)

    tmp_output_file = Tempfile.new(%w[tesseract_output .txt])
    tmp_output = tmp_output_file.path
    tmp_output_file.unlink

    command = [
      "tesseract",
      upload_path,
      tmp_output.sub(/\.txt$/, ""), # Tesseract adds .txt automatically
    ]

    success =
      Discourse::Utils.execute_command(
        *command,
        timeout: 20.seconds,
        failure_message: "Failed to OCR image with Tesseract",
      )

    if success && File.exist?(tmp_output)
      text = File.read(tmp_output)
      begin
        File.delete(tmp_output)
      rescue StandardError
        nil
      end
      text.strip
    else
      Rails.logger.error("Tesseract OCR failed for #{upload_path}")
      ""
    end
  rescue => e
    Rails.logger.error("Error during OCR processing: #{e.message}")
    ""
  end
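
  # Split the LLM response on <chunk>...</chunk> markers, e.g.
  #   "<chunk>intro</chunk>\n<chunk>details</chunk>" => ["intro", "details"]
  # Text outside the markers is kept as its own chunk; when no markers are
  # present the whole response is returned as a single chunk.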
  def extract_chunks(text)
    return [] if text.nil? || text.empty?

    if text.include?("<chunk>") && text.include?("</chunk>")
      chunks = []
      remaining_text = text.dup

      while remaining_text.length > 0
        if remaining_text.start_with?("<chunk>")
          # Extract chunk content
          chunk_end = remaining_text.index("</chunk>")
          if chunk_end
            chunk = remaining_text[7..chunk_end - 1].strip
            chunks << chunk unless chunk.empty?
            remaining_text = remaining_text[chunk_end + 8..-1] || ""
          else
            # Malformed chunk - add remaining text and break
            chunks << remaining_text[7..-1].strip
            break
          end
        else
          # Handle text before next chunk if it exists
          next_chunk = remaining_text.index("<chunk>")
          if next_chunk
            text_before = remaining_text[0...next_chunk].strip
            chunks << text_before unless text_before.empty?
            remaining_text = remaining_text[next_chunk..-1]
          else
            # No more chunks - add remaining text and break
            chunks << remaining_text.strip
            break
          end
        end
      end

      return chunks.reject(&:empty?)
    end

    [text]
  end
end