FEATURE: Native PDF support (#1127)

* FEATURE: Native PDF support

This changes PDF handling so that we use the pdf-reader gem to extract text from PDFs

* This means that our simple pdf eval passes at last

* fix spec

* skip test in CI

* test file support

* Update lib/utils/image_to_text.rb

Co-authored-by: Alan Guo Xiang Tan <gxtan1990@gmail.com>

* address pr comments

---------

Co-authored-by: Alan Guo Xiang Tan <gxtan1990@gmail.com>
This commit is contained in:
Sam 2025-02-18 09:22:57 +11:00 committed by GitHub
parent 9a6aec2cf6
commit ce79a18790
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
19 changed files with 248 additions and 62 deletions

View File

@ -41,7 +41,7 @@ module DiscourseAi
tools: tools, tools: tools,
llms: llms, llms: llms,
settings: { settings: {
rag_pdf_images_enabled: SiteSetting.ai_rag_pdf_images_enabled, rag_images_enabled: SiteSetting.ai_rag_images_enabled,
}, },
}, },
} }

View File

@ -48,8 +48,8 @@ module DiscourseAi
def validate_extension!(filename) def validate_extension!(filename)
extension = File.extname(filename)[1..-1] || "" extension = File.extname(filename)[1..-1] || ""
authorized_extensions = %w[txt md] authorized_extensions = %w[txt md pdf]
authorized_extensions.concat(%w[pdf png jpg jpeg]) if SiteSetting.ai_rag_pdf_images_enabled authorized_extensions.concat(%w[png jpg jpeg]) if SiteSetting.ai_rag_images_enabled
if !authorized_extensions.include?(extension) if !authorized_extensions.include?(extension)
raise Discourse::InvalidParameters.new( raise Discourse::InvalidParameters.new(
I18n.t( I18n.t(

View File

@ -164,22 +164,16 @@ module ::Jobs
end end
def get_uploaded_file(upload:, target:) def get_uploaded_file(upload:, target:)
if %w[pdf png jpg jpeg].include?(upload.extension) && !SiteSetting.ai_rag_pdf_images_enabled if %w[png jpg jpeg].include?(upload.extension) && !SiteSetting.ai_rag_images_enabled
raise Discourse::InvalidAccess.new( raise Discourse::InvalidAccess.new(
"The setting ai_rag_pdf_images_enabled is false, can not index images and pdfs.", "The setting ai_rag_images_enabled is false, can not index images",
) )
end end
if upload.extension == "pdf" if upload.extension == "pdf"
pages =
DiscourseAi::Utils::PdfToImages.new(
upload: upload,
user: Discourse.system_user,
).uploaded_pages
return( return(
DiscourseAi::Utils::ImageToText.as_fake_file( DiscourseAi::Utils::PdfToText.as_fake_file(
uploads: pages, upload: upload,
llm_model: target.rag_llm_model, llm_model: SiteSetting.ai_rag_images_enabled ? target.rag_llm_model : nil,
user: Discourse.system_user, user: Discourse.system_user,
) )
) )

View File

@ -10,7 +10,7 @@ class AiCustomToolListSerializer < ApplicationSerializer
presets: AiTool.presets, presets: AiTool.presets,
llms: DiscourseAi::Configuration::LlmEnumerator.values_for_serialization, llms: DiscourseAi::Configuration::LlmEnumerator.values_for_serialization,
settings: { settings: {
rag_pdf_images_enabled: SiteSetting.ai_rag_pdf_images_enabled, rag_images_enabled: SiteSetting.ai_rag_images_enabled,
}, },
} }
end end

View File

@ -596,13 +596,13 @@ export default class PersonaEditor extends Component {
@target={{this.editingModel}} @target={{this.editingModel}}
@updateUploads={{this.updateUploads}} @updateUploads={{this.updateUploads}}
@onRemove={{this.removeUpload}} @onRemove={{this.removeUpload}}
@allowPdfsAndImages={{@personas.resultSetMeta.settings.rag_pdf_images_enabled}} @allowImages={{@personas.resultSetMeta.settings.rag_images_enabled}}
/> />
</div> </div>
<RagOptions <RagOptions
@model={{this.editingModel}} @model={{this.editingModel}}
@llms={{@personas.resultSetMeta.llms}} @llms={{@personas.resultSetMeta.llms}}
@allowPdfsAndImages={{@personas.resultSetMeta.settings.rag_pdf_images_enabled}} @allowImages={{@personas.resultSetMeta.settings.rag_images_enabled}}
> >
<div class="control-group"> <div class="control-group">
<label>{{i18n <label>{{i18n

View File

@ -245,13 +245,13 @@ export default class AiToolEditor extends Component {
@target={{this.editingModel}} @target={{this.editingModel}}
@updateUploads={{this.updateUploads}} @updateUploads={{this.updateUploads}}
@onRemove={{this.removeUpload}} @onRemove={{this.removeUpload}}
@allowPdfsAndImages={{@settings.rag_pdf_images_enabled}} @allowImages={{@settings.rag_images_enabled}}
/> />
</div> </div>
<RagOptions <RagOptions
@model={{this.editingModel}} @model={{this.editingModel}}
@llms={{@llms}} @llms={{@llms}}
@allowPdfsAndImages={{@settings.rag_pdf_images_enabled}} @allowImages={{@settings.rag_images_enabled}}
/> />
{{/if}} {{/if}}

View File

@ -81,7 +81,7 @@ export default class RagOptions extends Component {
}} }}
/> />
</div> </div>
{{#if @allowPdfsAndImages}} {{#if @allowImages}}
<div class="control-group"> <div class="control-group">
<label>{{i18n "discourse_ai.rag.options.rag_llm_model"}}</label> <label>{{i18n "discourse_ai.rag.options.rag_llm_model"}}</label>
<AiLlmSelector <AiLlmSelector

View File

@ -78,10 +78,10 @@ export default class RagUploader extends Component {
} }
get acceptedFileTypes() { get acceptedFileTypes() {
if (this.args?.allowPdfsAndImages) { if (this.args?.allowImages) {
return ".txt,.md,.pdf,.png,.jpg,.jpeg"; return ".txt,.md,.png,.jpg,.jpeg";
} else { } else {
return ".txt,.md"; return ".txt,.md,.pdf";
} }
} }
@ -127,8 +127,8 @@ export default class RagUploader extends Component {
<template> <template>
<div class="rag-uploader"> <div class="rag-uploader">
<h3>{{i18n "discourse_ai.rag.uploads.title"}}</h3> <h3>{{i18n "discourse_ai.rag.uploads.title"}}</h3>
{{#if @allowPdfsAndImages}} {{#if @allowImages}}
<p>{{i18n "discourse_ai.rag.uploads.description_with_pdfs"}}</p> <p>{{i18n "discourse_ai.rag.uploads.description_with_images"}}</p>
{{else}} {{else}}
<p>{{i18n "discourse_ai.rag.uploads.description"}}</p> <p>{{i18n "discourse_ai.rag.uploads.description"}}</p>
{{/if}} {{/if}}

View File

@ -280,8 +280,8 @@ en:
hide_indexing_options: "Hide upload options" hide_indexing_options: "Hide upload options"
uploads: uploads:
title: "Uploads" title: "Uploads"
description: "Plaintext (.txt) or markdown (.md)" description: "PDF (.pdf), Plaintext (.txt) or markdown (.md)"
description_with_pdfs: "Plaintext (.txt), markdown (.md), PDF (.pdf) or image (.png, .jpeg)" description_with_images: "Plaintext (.txt), markdown (.md), PDF (.pdf) or image (.png, .jpeg)"
button: "Add files" button: "Add files"
filter: "Filter uploads" filter: "Filter uploads"
indexed: "Indexed" indexed: "Indexed"

View File

@ -355,6 +355,6 @@ discourse_ai:
hidden: true hidden: true
type: list type: list
ai_rag_pdf_images_enabled: ai_rag_images_enabled:
default: false default: false
hidden: true hidden: true

View File

@ -130,22 +130,13 @@ class DiscourseAi::Evals::Eval
upload = upload =
UploadCreator.new(File.open(path), File.basename(path)).create_for(Discourse.system_user.id) UploadCreator.new(File.open(path), File.basename(path)).create_for(Discourse.system_user.id)
uploads =
DiscourseAi::Utils::PdfToImages.new(
upload: upload,
user: Discourse.system_user,
).uploaded_pages
text = +"" text = +""
uploads.each do |page_upload| DiscourseAi::Utils::PdfToText
DiscourseAi::Utils::ImageToText .new(upload: upload, user: Discourse.system_user, llm_model: llm.llm_model)
.new(upload: page_upload, llm_model: llm.llm_model, user: Discourse.system_user) .extract_text do |chunk|
.extract_text do |chunk, error| text << chunk if chunk
text << chunk if chunk text << "\n\n" if chunk
text << "\n\n" if chunk end
end
upload.destroy
end
text text
ensure ensure

View File

@ -50,12 +50,27 @@ class DiscourseAi::Utils::ImageToText
Reader.new(uploads: uploads, llm_model: llm_model, user: user) Reader.new(uploads: uploads, llm_model: llm_model, user: user)
end end
# Reports whether a `tesseract` executable can be located with `which`.
#
# The lookup result (true or false) is stored in @tesseract_installed the
# first time this is called, so the external command runs at most once.
#
# Returns true when the `which tesseract` command succeeds, false when it
# raises Discourse::Utils::CommandError.
def self.tesseract_installed?
return @tesseract_installed if defined?(@tesseract_installed)

found =
begin
Discourse::Utils.execute_command("which", "tesseract")
true
rescue Discourse::Utils::CommandError
false
end

@tesseract_installed = found
end
attr_reader :upload, :llm_model, :user attr_reader :upload, :llm_model, :user
def initialize(upload:, llm_model:, user:) def initialize(upload:, llm_model:, user:, guidance_text: nil)
@upload = upload @upload = upload
@llm_model = llm_model @llm_model = llm_model
@user = user @user = user
@guidance_text = guidance_text
end end
def extract_text(retries: 3) def extract_text(retries: 3)
@ -104,7 +119,8 @@ class DiscourseAi::Utils::ImageToText
end end
def extract_text_from_page(page) def extract_text_from_page(page)
raw_text = extract_text_with_tesseract(page) raw_text = @guidance_text
raw_text ||= extract_text_with_tesseract(page) if self.class.tesseract_installed?
llm = llm_model.to_llm llm = llm_model.to_llm
if raw_text.present? if raw_text.present?
@ -112,7 +128,7 @@ class DiscourseAi::Utils::ImageToText
{ {
type: :user, type: :user,
content: content:
"The following text was extracted from an image using OCR. Please enhance, correct, and structure this content while maintaining the original meaning:\n\n#{raw_text}", "The following text was extracted from an image using OCR. Please enhance, correct, and structure this content while maintaining the original text:\n\n#{raw_text}",
upload_ids: [page.id], upload_ids: [page.id],
}, },
] ]
@ -127,6 +143,8 @@ class DiscourseAi::Utils::ImageToText
end end
def extract_text_with_tesseract(page) def extract_text_with_tesseract(page)
# return nil if we cannot find the tesseract binary
return nil if !self.class.tesseract_installed?
upload_path = upload_path =
if page.local? if page.local?
Discourse.store.path_for(page) Discourse.store.path_for(page)

View File

@ -19,8 +19,6 @@ class DiscourseAi::Utils::PdfToImages
end end
def extract_pages def extract_pages
Dir.mktmpdir("discourse-pdf-#{SecureRandom.hex(8)}")
begin begin
pdf_path = pdf_path =
if upload.local? if upload.local?
@ -31,6 +29,7 @@ class DiscourseAi::Utils::PdfToImages
raise Discourse::InvalidParameters.new("Failed to download PDF") if pdf_path.nil? raise Discourse::InvalidParameters.new("Failed to download PDF") if pdf_path.nil?
temp_dir = Dir.mktmpdir("discourse-pdf-#{SecureRandom.hex(8)}")
temp_pdf = File.join(temp_dir, "source.pdf") temp_pdf = File.join(temp_dir, "source.pdf")
FileUtils.cp(pdf_path, temp_pdf) FileUtils.cp(pdf_path, temp_pdf)
@ -74,7 +73,7 @@ class DiscourseAi::Utils::PdfToImages
@uploaded_pages = uploads @uploaded_pages = uploads
ensure ensure
FileUtils.rm_rf(temp_dir) FileUtils.rm_rf(temp_dir) if temp_dir
end end
end end
end end

112
lib/utils/pdf_to_text.rb Normal file
View File

@ -0,0 +1,112 @@
# frozen_string_literal: true
# Extracts text from an uploaded PDF one page at a time using the pdf-reader
# gem. When an LLM model is supplied, each page is additionally rendered to a
# PNG and handed to DiscourseAi::Utils::ImageToText together with the text
# pdf-reader extracted, and the chunks ImageToText yields are passed on instead.
class DiscourseAi::Utils::PdfToText
  MAX_PDF_SIZE = 100.megabytes

  # Minimal read-only, file-like wrapper around PdfToText#extract_text.
  # It exposes only #read(length), pulling extracted chunks lazily through an
  # Enumerator so pages are processed as the consumer asks for more data.
  class Reader
    def initialize(upload:, user: nil, llm_model: nil)
      @extractor =
        DiscourseAi::Utils::PdfToText.new(upload: upload, user: user, llm_model: llm_model)
      @enumerator = create_enumerator
      @buffer = +""
    end

    # Returns up to `length` characters of extracted text, or nil once every
    # chunk has been consumed.
    #
    # A single call never spans two non-empty chunks: buffered leftovers are
    # served first, otherwise the next non-empty chunk is fetched.
    def read(length)
      # Blank pages produce empty chunks. Skip over them so that an empty
      # string is never returned before the end of the document; only nil
      # signals that there is nothing left to read.
      while @buffer.empty?
        begin
          @buffer << @enumerator.next
        rescue StopIteration
          return nil
        end
      end

      @buffer.slice!(0, length)
    end

    private

    # Wraps extract_text in an external enumerator; nil chunks become "".
    def create_enumerator
      Enumerator.new { |yielder| @extractor.extract_text { |chunk| yielder.yield(chunk || "") } }
    end
  end

  attr_reader :upload

  # Builds a Reader that can be consumed through #read like a file.
  def self.as_fake_file(upload:, user: nil, llm_model: nil)
    Reader.new(upload: upload, user: user, llm_model: llm_model)
  end

  # upload    - the PDF upload to read
  # user      - user passed to ImageToText and used as owner of page uploads
  # llm_model - optional; when nil the raw pdf-reader text is yielded as is
  def initialize(upload:, user: nil, llm_model: nil)
    @upload = upload
    @user = user
    @llm_model = llm_model
  end

  # Yields the extracted text of the PDF chunk by chunk, in page order.
  # Must be called with a block.
  #
  # Raises Discourse::InvalidParameters when the PDF can not be obtained from
  # the store.
  def extract_text
    pdf_path =
      if upload.local?
        Discourse.store.path_for(upload)
      else
        # NOTE(review): the keyword is named *_kb while MAX_PDF_SIZE is built
        # with `megabytes` - confirm which unit download_safe really expects.
        Discourse.store.download_safe(upload, max_file_size_kb: MAX_PDF_SIZE)&.path
      end

    raise Discourse::InvalidParameters.new("Failed to download PDF") if pdf_path.nil?

    # loaded lazily, the gem is declared with `require: false`
    require "pdf/reader"

    page_number = 0
    PDF::Reader.open(pdf_path) do |reader|
      reader.pages.each do |page|
        page_number += 1
        llm_decorate(page_number: page_number, text: page.text, pdf_path: pdf_path) do |chunk|
          yield chunk
        end
      end
    end
  end

  # Yields the text for a single page.
  #
  # Without an LLM model the given text is yielded unchanged. With one, the
  # page is rendered to a PNG with ImageMagick, stored as an upload, and
  # ImageToText is run on it with `text` as guidance; every chunk it produces
  # is yielded.
  #
  # page_number - 1 based page number
  # text        - text pdf-reader extracted for the page
  # pdf_path    - path of the PDF on disk
  def llm_decorate(page_number:, text:, pdf_path:)
    raise "Must be called with block" if !block_given?

    if !@llm_model
      yield text
      return
    end

    begin
      temp_dir = Dir.mktmpdir("discourse-pdf-#{SecureRandom.hex(8)}")
      output_path = File.join(temp_dir, "page-#{page_number}.png")

      # Extract specific page using ImageMagick
      # image magick uses 0 based page numbers
      command = [
        "magick",
        "-density",
        "300",
        "#{pdf_path}[#{page_number - 1}]",
        "-background",
        "white",
        "-auto-orient",
        "-quality",
        "85",
        output_path,
      ]

      Discourse::Utils.execute_command(
        *command,
        failure_message: "Failed to convert PDF page #{page_number} to image",
        timeout: 30,
      )

      # TODO - we are creating leftover uploads, they will be cleaned up
      # but maybe we should just keep them around?
      #
      # The block form closes the rendered image as soon as the upload has
      # been created, so no file handle is left open per page.
      page_upload =
        File.open(output_path) do |image|
          UploadCreator.new(image, "page-#{page_number}.png").create_for(@user&.id)
        end

      DiscourseAi::Utils::ImageToText
        .new(upload: page_upload, llm_model: @llm_model, user: @user, guidance_text: text)
        .extract_text { |chunk| yield chunk }
    ensure
      FileUtils.rm_rf(temp_dir) if temp_dir
    end
  end
end

View File

@ -12,6 +12,16 @@ gem "tokenizers", "0.4.4"
gem "tiktoken_ruby", "0.0.9" gem "tiktoken_ruby", "0.0.9"
gem "ed25519", "1.2.4" #TODO remove this as existing ssl gem should handle this gem "ed25519", "1.2.4" #TODO remove this as existing ssl gem should handle this
# we probably want to move all dependencies directly in to the Discourse Gemfile, this
# will give us a strong guarantee that the dependencies are compatible and keep getting upgraded
gem "Ascii85", "2.0.1", require: false
gem "ruby-rc4", "0.1.5", require: false
gem "hashery", "2.1.2", require: false
gem "ttfunk", "1.8.0", require: false
gem "afm", "0.2.2", require: false
# all above are required by pdf-reader
gem "pdf-reader", "2.14.1", require: false
enabled_site_setting :discourse_ai_enabled enabled_site_setting :discourse_ai_enabled
register_asset "stylesheets/common/streaming.scss" register_asset "stylesheets/common/streaming.scss"

BIN
spec/fixtures/rag/2-page.pdf vendored Normal file

Binary file not shown.

View File

@ -3,7 +3,7 @@
RSpec.describe Jobs::DigestRagUpload do RSpec.describe Jobs::DigestRagUpload do
fab!(:persona) { Fabricate(:ai_persona) } fab!(:persona) { Fabricate(:ai_persona) }
fab!(:upload) { Fabricate(:upload, extension: "txt") } fab!(:upload) { Fabricate(:upload, extension: "txt") }
fab!(:pdf_upload) { Fabricate(:upload, extension: "pdf") } fab!(:image_upload) { Fabricate(:upload, extension: "png") }
let(:document_file) { StringIO.new("some text" * 200) } let(:document_file) { StringIO.new("some text" * 200) }
fab!(:cloudflare_embedding_def) fab!(:cloudflare_embedding_def)
@ -31,13 +31,13 @@ RSpec.describe Jobs::DigestRagUpload do
end end
describe "#execute" do describe "#execute" do
context "when processing a PDF upload" do context "when processing an image upload" do
it "will reject the indexing if the site setting is not enabled" do it "will reject the indexing if the site setting is not enabled" do
SiteSetting.ai_rag_pdf_images_enabled = false SiteSetting.ai_rag_images_enabled = false
expect { expect {
described_class.new.execute( described_class.new.execute(
upload_id: pdf_upload.id, upload_id: image_upload.id,
target_id: persona.id, target_id: persona.id,
target_type: persona.class.to_s, target_type: persona.class.to_s,
) )

View File

@ -0,0 +1,62 @@
# frozen_string_literal: true
# Specs for DiscourseAi::Utils::PdfToText, driven by the 2 page fixture PDF.
RSpec.describe DiscourseAi::Utils::PdfToText do
  fab!(:llm_model)
  fab!(:user)

  let(:pdf) { plugin_file_from_fixtures("2-page.pdf", "rag") }
  let(:upload) { UploadCreator.new(pdf, "2-page.pdf").create_for(Discourse.system_user.id) }

  before { SiteSetting.authorized_extensions = "pdf|png|jpg|jpeg" }

  describe "#extract_text" do
    it "extracts text from PDF pages" do
      extracted = []

      described_class.new(upload: upload).extract_text { |page_text| extracted << page_text }

      expect(extracted).to eq(["Page 1", "Page 2"])
    end
  end

  context "when improving PDF extraction with LLM" do
    # one prepared LLM response per page of the fixture
    let(:llm_responses) do
      [
        "<chunk>Page 1: LLM chunk 1</chunk><chunk>Page 1: LLM chunk 2</chunk>",
        "<chunk>Page 2: LLM chunk 3</chunk>",
      ]
    end

    let(:expected_chunks) do
      ["Page 1: LLM chunk 1", "Page 1: LLM chunk 2", "Page 2: LLM chunk 3"]
    end

    before do
      if ENV["CI"]
        skip "This test requires imagemagick is installed with ghostscript support - which is not available in CI"
      end
    end

    it "can properly simulate a file" do
      collected = []

      DiscourseAi::Completions::Llm.with_prepared_responses(llm_responses) do |_, _, _prompts|
        fake_file = described_class.as_fake_file(upload: upload, user: user, llm_model: llm_model)

        loop do
          content = fake_file.read(100_000)
          break if !content
          collected << content
        end
      end

      expect(collected).to eq(expected_chunks)
    end

    it "works as expected" do
      extractor = described_class.new(upload: upload, user: user, llm_model: llm_model)
      collected = []

      DiscourseAi::Completions::Llm.with_prepared_responses(llm_responses) do |_, _, _prompts|
        extractor.extract_text { |chunk| collected << chunk }
      end

      expect(collected).to eq(expected_chunks)
    end
  end
end

View File

@ -24,10 +24,10 @@ RSpec.describe DiscourseAi::Admin::RagDocumentFragmentsController do
end end
describe "POST #upload_file" do describe "POST #upload_file" do
let :fake_pdf do let :fake_image do
@cleanup_files ||= [] @cleanup_files ||= []
tempfile = Tempfile.new(%w[test .pdf]) tempfile = Tempfile.new(%w[test .png])
tempfile.write("fake pdf") tempfile.write("fake image")
tempfile.rewind tempfile.rewind
@cleanup_files << tempfile @cleanup_files << tempfile
tempfile tempfile
@ -46,26 +46,26 @@ RSpec.describe DiscourseAi::Admin::RagDocumentFragmentsController do
end end
it "rejects PDF files if site setting is not enabled" do it "rejects PDF files if site setting is not enabled" do
SiteSetting.ai_rag_pdf_images_enabled = false SiteSetting.ai_rag_images_enabled = false
post "/admin/plugins/discourse-ai/rag-document-fragments/files/upload.json", post "/admin/plugins/discourse-ai/rag-document-fragments/files/upload.json",
params: { params: {
file: Rack::Test::UploadedFile.new(fake_pdf), file: Rack::Test::UploadedFile.new(fake_image),
} }
expect(response.status).to eq(400) expect(response.status).to eq(400)
end end
it "allows PDF files if site setting is enabled" do it "allows image files if site setting is enabled" do
SiteSetting.ai_rag_pdf_images_enabled = true SiteSetting.ai_rag_images_enabled = true
post "/admin/plugins/discourse-ai/rag-document-fragments/files/upload.json", post "/admin/plugins/discourse-ai/rag-document-fragments/files/upload.json",
params: { params: {
file: Rack::Test::UploadedFile.new(fake_pdf), file: Rack::Test::UploadedFile.new(fake_image),
} }
upload = Upload.last upload = Upload.last
expect(upload.original_filename).to end_with(".pdf") expect(upload.original_filename).to end_with(".png")
end end
end end
end end