FEATURE: Native PDF support (#1127)
* FEATURE: Native PDF support

  This amends it so we use the PDF Reader gem to extract text from PDFs.

* This means that our simple PDF eval passes at last
* fix spec
* skip test in CI
* test file support
* Update lib/utils/image_to_text.rb

  Co-authored-by: Alan Guo Xiang Tan <gxtan1990@gmail.com>

* address PR comments

---------

Co-authored-by: Alan Guo Xiang Tan <gxtan1990@gmail.com>
This commit is contained in:
parent
9a6aec2cf6
commit
ce79a18790
|
@ -41,7 +41,7 @@ module DiscourseAi
|
||||||
tools: tools,
|
tools: tools,
|
||||||
llms: llms,
|
llms: llms,
|
||||||
settings: {
|
settings: {
|
||||||
rag_pdf_images_enabled: SiteSetting.ai_rag_pdf_images_enabled,
|
rag_images_enabled: SiteSetting.ai_rag_images_enabled,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
|
@ -48,8 +48,8 @@ module DiscourseAi
|
||||||
|
|
||||||
def validate_extension!(filename)
|
def validate_extension!(filename)
|
||||||
extension = File.extname(filename)[1..-1] || ""
|
extension = File.extname(filename)[1..-1] || ""
|
||||||
authorized_extensions = %w[txt md]
|
authorized_extensions = %w[txt md pdf]
|
||||||
authorized_extensions.concat(%w[pdf png jpg jpeg]) if SiteSetting.ai_rag_pdf_images_enabled
|
authorized_extensions.concat(%w[png jpg jpeg]) if SiteSetting.ai_rag_images_enabled
|
||||||
if !authorized_extensions.include?(extension)
|
if !authorized_extensions.include?(extension)
|
||||||
raise Discourse::InvalidParameters.new(
|
raise Discourse::InvalidParameters.new(
|
||||||
I18n.t(
|
I18n.t(
|
||||||
|
|
|
@ -164,22 +164,16 @@ module ::Jobs
|
||||||
end
|
end
|
||||||
|
|
||||||
def get_uploaded_file(upload:, target:)
|
def get_uploaded_file(upload:, target:)
|
||||||
if %w[pdf png jpg jpeg].include?(upload.extension) && !SiteSetting.ai_rag_pdf_images_enabled
|
if %w[png jpg jpeg].include?(upload.extension) && !SiteSetting.ai_rag_images_enabled
|
||||||
raise Discourse::InvalidAccess.new(
|
raise Discourse::InvalidAccess.new(
|
||||||
"The setting ai_rag_pdf_images_enabled is false, can not index images and pdfs.",
|
"The setting ai_rag_images_enabled is false, can not index images",
|
||||||
)
|
)
|
||||||
end
|
end
|
||||||
if upload.extension == "pdf"
|
if upload.extension == "pdf"
|
||||||
pages =
|
|
||||||
DiscourseAi::Utils::PdfToImages.new(
|
|
||||||
upload: upload,
|
|
||||||
user: Discourse.system_user,
|
|
||||||
).uploaded_pages
|
|
||||||
|
|
||||||
return(
|
return(
|
||||||
DiscourseAi::Utils::ImageToText.as_fake_file(
|
DiscourseAi::Utils::PdfToText.as_fake_file(
|
||||||
uploads: pages,
|
upload: upload,
|
||||||
llm_model: target.rag_llm_model,
|
llm_model: SiteSetting.ai_rag_images_enabled ? target.rag_llm_model : nil,
|
||||||
user: Discourse.system_user,
|
user: Discourse.system_user,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
|
@ -10,7 +10,7 @@ class AiCustomToolListSerializer < ApplicationSerializer
|
||||||
presets: AiTool.presets,
|
presets: AiTool.presets,
|
||||||
llms: DiscourseAi::Configuration::LlmEnumerator.values_for_serialization,
|
llms: DiscourseAi::Configuration::LlmEnumerator.values_for_serialization,
|
||||||
settings: {
|
settings: {
|
||||||
rag_pdf_images_enabled: SiteSetting.ai_rag_pdf_images_enabled,
|
rag_images_enabled: SiteSetting.ai_rag_images_enabled,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
end
|
end
|
||||||
|
|
|
@ -596,13 +596,13 @@ export default class PersonaEditor extends Component {
|
||||||
@target={{this.editingModel}}
|
@target={{this.editingModel}}
|
||||||
@updateUploads={{this.updateUploads}}
|
@updateUploads={{this.updateUploads}}
|
||||||
@onRemove={{this.removeUpload}}
|
@onRemove={{this.removeUpload}}
|
||||||
@allowPdfsAndImages={{@personas.resultSetMeta.settings.rag_pdf_images_enabled}}
|
@allowImages={{@personas.resultSetMeta.settings.rag_images_enabled}}
|
||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
<RagOptions
|
<RagOptions
|
||||||
@model={{this.editingModel}}
|
@model={{this.editingModel}}
|
||||||
@llms={{@personas.resultSetMeta.llms}}
|
@llms={{@personas.resultSetMeta.llms}}
|
||||||
@allowPdfsAndImages={{@personas.resultSetMeta.settings.rag_pdf_images_enabled}}
|
@allowImages={{@personas.resultSetMeta.settings.rag_images_enabled}}
|
||||||
>
|
>
|
||||||
<div class="control-group">
|
<div class="control-group">
|
||||||
<label>{{i18n
|
<label>{{i18n
|
||||||
|
|
|
@ -245,13 +245,13 @@ export default class AiToolEditor extends Component {
|
||||||
@target={{this.editingModel}}
|
@target={{this.editingModel}}
|
||||||
@updateUploads={{this.updateUploads}}
|
@updateUploads={{this.updateUploads}}
|
||||||
@onRemove={{this.removeUpload}}
|
@onRemove={{this.removeUpload}}
|
||||||
@allowPdfsAndImages={{@settings.rag_pdf_images_enabled}}
|
@allowImages={{@settings.rag_images_enabled}}
|
||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
<RagOptions
|
<RagOptions
|
||||||
@model={{this.editingModel}}
|
@model={{this.editingModel}}
|
||||||
@llms={{@llms}}
|
@llms={{@llms}}
|
||||||
@allowPdfsAndImages={{@settings.rag_pdf_images_enabled}}
|
@allowImages={{@settings.rag_images_enabled}}
|
||||||
/>
|
/>
|
||||||
{{/if}}
|
{{/if}}
|
||||||
|
|
||||||
|
|
|
@ -81,7 +81,7 @@ export default class RagOptions extends Component {
|
||||||
}}
|
}}
|
||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
{{#if @allowPdfsAndImages}}
|
{{#if @allowImages}}
|
||||||
<div class="control-group">
|
<div class="control-group">
|
||||||
<label>{{i18n "discourse_ai.rag.options.rag_llm_model"}}</label>
|
<label>{{i18n "discourse_ai.rag.options.rag_llm_model"}}</label>
|
||||||
<AiLlmSelector
|
<AiLlmSelector
|
||||||
|
|
|
@ -78,10 +78,10 @@ export default class RagUploader extends Component {
|
||||||
}
|
}
|
||||||
|
|
||||||
get acceptedFileTypes() {
|
get acceptedFileTypes() {
|
||||||
if (this.args?.allowPdfsAndImages) {
|
if (this.args?.allowImages) {
|
||||||
return ".txt,.md,.pdf,.png,.jpg,.jpeg";
|
return ".txt,.md,.png,.jpg,.jpeg";
|
||||||
} else {
|
} else {
|
||||||
return ".txt,.md";
|
return ".txt,.md,.pdf";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -127,8 +127,8 @@ export default class RagUploader extends Component {
|
||||||
<template>
|
<template>
|
||||||
<div class="rag-uploader">
|
<div class="rag-uploader">
|
||||||
<h3>{{i18n "discourse_ai.rag.uploads.title"}}</h3>
|
<h3>{{i18n "discourse_ai.rag.uploads.title"}}</h3>
|
||||||
{{#if @allowPdfsAndImages}}
|
{{#if @allowImages}}
|
||||||
<p>{{i18n "discourse_ai.rag.uploads.description_with_pdfs"}}</p>
|
<p>{{i18n "discourse_ai.rag.uploads.description_with_images"}}</p>
|
||||||
{{else}}
|
{{else}}
|
||||||
<p>{{i18n "discourse_ai.rag.uploads.description"}}</p>
|
<p>{{i18n "discourse_ai.rag.uploads.description"}}</p>
|
||||||
{{/if}}
|
{{/if}}
|
||||||
|
|
|
@ -280,8 +280,8 @@ en:
|
||||||
hide_indexing_options: "Hide upload options"
|
hide_indexing_options: "Hide upload options"
|
||||||
uploads:
|
uploads:
|
||||||
title: "Uploads"
|
title: "Uploads"
|
||||||
description: "Plaintext (.txt) or markdown (.md)"
|
description: "PDF (.pdf), Plaintext (.txt) or markdown (.md)"
|
||||||
description_with_pdfs: "Plaintext (.txt), markdown (.md), PDF (.pdf) or image (.png, .jpeg)"
|
description_with_images: "Plaintext (.txt), markdown (.md), PDF (.pdf) or image (.png, .jpeg)"
|
||||||
button: "Add files"
|
button: "Add files"
|
||||||
filter: "Filter uploads"
|
filter: "Filter uploads"
|
||||||
indexed: "Indexed"
|
indexed: "Indexed"
|
||||||
|
|
|
@ -355,6 +355,6 @@ discourse_ai:
|
||||||
hidden: true
|
hidden: true
|
||||||
type: list
|
type: list
|
||||||
|
|
||||||
ai_rag_pdf_images_enabled:
|
ai_rag_images_enabled:
|
||||||
default: false
|
default: false
|
||||||
hidden: true
|
hidden: true
|
||||||
|
|
|
@ -130,22 +130,13 @@ class DiscourseAi::Evals::Eval
|
||||||
upload =
|
upload =
|
||||||
UploadCreator.new(File.open(path), File.basename(path)).create_for(Discourse.system_user.id)
|
UploadCreator.new(File.open(path), File.basename(path)).create_for(Discourse.system_user.id)
|
||||||
|
|
||||||
uploads =
|
|
||||||
DiscourseAi::Utils::PdfToImages.new(
|
|
||||||
upload: upload,
|
|
||||||
user: Discourse.system_user,
|
|
||||||
).uploaded_pages
|
|
||||||
|
|
||||||
text = +""
|
text = +""
|
||||||
uploads.each do |page_upload|
|
DiscourseAi::Utils::PdfToText
|
||||||
DiscourseAi::Utils::ImageToText
|
.new(upload: upload, user: Discourse.system_user, llm_model: llm.llm_model)
|
||||||
.new(upload: page_upload, llm_model: llm.llm_model, user: Discourse.system_user)
|
.extract_text do |chunk|
|
||||||
.extract_text do |chunk, error|
|
text << chunk if chunk
|
||||||
text << chunk if chunk
|
text << "\n\n" if chunk
|
||||||
text << "\n\n" if chunk
|
end
|
||||||
end
|
|
||||||
upload.destroy
|
|
||||||
end
|
|
||||||
|
|
||||||
text
|
text
|
||||||
ensure
|
ensure
|
||||||
|
|
|
@ -50,12 +50,27 @@ class DiscourseAi::Utils::ImageToText
|
||||||
Reader.new(uploads: uploads, llm_model: llm_model, user: user)
|
Reader.new(uploads: uploads, llm_model: llm_model, user: user)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Reports whether a `tesseract` executable can be located with `which`.
#
# The lookup runs at most once: its outcome (true or false) is stored in a
# class-level instance variable and returned on every later call.
#
# @return [Boolean]
def self.tesseract_installed?
  # `defined?` rather than `||=` so that a cached `false` is honoured too.
  return @tesseract_installed if defined?(@tesseract_installed)

  @tesseract_installed =
    begin
      Discourse::Utils.execute_command("which", "tesseract")
      true
    rescue Discourse::Utils::CommandError
      false
    end
end
|
||||||
|
|
||||||
attr_reader :upload, :llm_model, :user
|
attr_reader :upload, :llm_model, :user
|
||||||
|
|
||||||
def initialize(upload:, llm_model:, user:)
|
def initialize(upload:, llm_model:, user:, guidance_text: nil)
|
||||||
@upload = upload
|
@upload = upload
|
||||||
@llm_model = llm_model
|
@llm_model = llm_model
|
||||||
@user = user
|
@user = user
|
||||||
|
@guidance_text = guidance_text
|
||||||
end
|
end
|
||||||
|
|
||||||
def extract_text(retries: 3)
|
def extract_text(retries: 3)
|
||||||
|
@ -104,7 +119,8 @@ class DiscourseAi::Utils::ImageToText
|
||||||
end
|
end
|
||||||
|
|
||||||
def extract_text_from_page(page)
|
def extract_text_from_page(page)
|
||||||
raw_text = extract_text_with_tesseract(page)
|
raw_text = @guidance_text
|
||||||
|
raw_text ||= extract_text_with_tesseract(page) if self.class.tesseract_installed?
|
||||||
|
|
||||||
llm = llm_model.to_llm
|
llm = llm_model.to_llm
|
||||||
if raw_text.present?
|
if raw_text.present?
|
||||||
|
@ -112,7 +128,7 @@ class DiscourseAi::Utils::ImageToText
|
||||||
{
|
{
|
||||||
type: :user,
|
type: :user,
|
||||||
content:
|
content:
|
||||||
"The following text was extracted from an image using OCR. Please enhance, correct, and structure this content while maintaining the original meaning:\n\n#{raw_text}",
|
"The following text was extracted from an image using OCR. Please enhance, correct, and structure this content while maintaining the original text:\n\n#{raw_text}",
|
||||||
upload_ids: [page.id],
|
upload_ids: [page.id],
|
||||||
},
|
},
|
||||||
]
|
]
|
||||||
|
@ -127,6 +143,8 @@ class DiscourseAi::Utils::ImageToText
|
||||||
end
|
end
|
||||||
|
|
||||||
def extract_text_with_tesseract(page)
|
def extract_text_with_tesseract(page)
|
||||||
|
# return nil if we cannot find the tesseract binary
|
||||||
|
return nil if !self.class.tesseract_installed?
|
||||||
upload_path =
|
upload_path =
|
||||||
if page.local?
|
if page.local?
|
||||||
Discourse.store.path_for(page)
|
Discourse.store.path_for(page)
|
||||||
|
|
|
@ -19,8 +19,6 @@ class DiscourseAi::Utils::PdfToImages
|
||||||
end
|
end
|
||||||
|
|
||||||
def extract_pages
|
def extract_pages
|
||||||
Dir.mktmpdir("discourse-pdf-#{SecureRandom.hex(8)}")
|
|
||||||
|
|
||||||
begin
|
begin
|
||||||
pdf_path =
|
pdf_path =
|
||||||
if upload.local?
|
if upload.local?
|
||||||
|
@ -31,6 +29,7 @@ class DiscourseAi::Utils::PdfToImages
|
||||||
|
|
||||||
raise Discourse::InvalidParameters.new("Failed to download PDF") if pdf_path.nil?
|
raise Discourse::InvalidParameters.new("Failed to download PDF") if pdf_path.nil?
|
||||||
|
|
||||||
|
temp_dir = Dir.mktmpdir("discourse-pdf-#{SecureRandom.hex(8)}")
|
||||||
temp_pdf = File.join(temp_dir, "source.pdf")
|
temp_pdf = File.join(temp_dir, "source.pdf")
|
||||||
FileUtils.cp(pdf_path, temp_pdf)
|
FileUtils.cp(pdf_path, temp_pdf)
|
||||||
|
|
||||||
|
@ -74,7 +73,7 @@ class DiscourseAi::Utils::PdfToImages
|
||||||
|
|
||||||
@uploaded_pages = uploads
|
@uploaded_pages = uploads
|
||||||
ensure
|
ensure
|
||||||
FileUtils.rm_rf(temp_dir)
|
FileUtils.rm_rf(temp_dir) if temp_dir
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -0,0 +1,112 @@
|
||||||
|
# frozen_string_literal: true
|
||||||
|
|
||||||
|
# Extracts the text of an uploaded PDF, one page at a time, with the
# pdf-reader gem.
#
# When an +llm_model+ is given, each page is additionally rendered to a PNG
# with ImageMagick and handed to DiscourseAi::Utils::ImageToText, with the
# pdf-reader text passed along as guidance; the chunks yielded are then the
# ones produced by ImageToText. Without an +llm_model+ the raw pdf-reader
# text of each page is yielded as-is.
class DiscourseAi::Utils::PdfToText
  MAX_PDF_SIZE = 100.megabytes

  # Minimal file-like wrapper: exposes only #read(length) on top of
  # PdfToText#extract_text, pulling extracted chunks lazily.
  class Reader
    # @param upload the PDF upload to read from
    # @param user forwarded to PdfToText
    # @param llm_model forwarded to PdfToText; nil disables LLM processing
    def initialize(upload:, user: nil, llm_model: nil)
      @extractor =
        DiscourseAi::Utils::PdfToText.new(upload: upload, user: user, llm_model: llm_model)
      @enumerator = create_enumerator
      @buffer = +""
    end

    # Returns up to +length+ characters of extracted text, or nil once the
    # extractor has nothing left to yield.
    #
    # Empty chunks (for example a page with no text) are skipped, so an empty
    # string is never returned mid-stream where a caller could mistake it
    # for the end of the data.
    def read(length)
      @buffer << @enumerator.next while @buffer.empty?
      @buffer.slice!(0, length)
    rescue StopIteration
      nil
    end

    private

    # Wraps the push-style extract_text block API in a pull-style Enumerator.
    def create_enumerator
      Enumerator.new { |yielder| @extractor.extract_text { |chunk| yielder.yield(chunk || "") } }
    end
  end

  attr_reader :upload

  # Builds a Reader over the given upload; arguments are as for #initialize.
  def self.as_fake_file(upload:, user: nil, llm_model: nil)
    Reader.new(upload: upload, user: user, llm_model: llm_model)
  end

  # @param upload the PDF upload to extract text from
  # @param user owner of the per-page image uploads created when an
  #   llm_model is present; also forwarded to ImageToText
  # @param llm_model optional model used to post-process each page
  def initialize(upload:, user: nil, llm_model: nil)
    @upload = upload
    @user = user
    @llm_model = llm_model
  end

  # Yields the extracted text of the PDF chunk by chunk, in page order.
  # Must be called with a block.
  #
  # @raise [Discourse::InvalidParameters] when no local path for the PDF
  #   could be obtained
  def extract_text
    pdf_path =
      if upload.local?
        Discourse.store.path_for(upload)
      else
        Discourse.store.download_safe(upload, max_file_size_kb: MAX_PDF_SIZE)&.path
      end

    raise Discourse::InvalidParameters.new("Failed to download PDF") if pdf_path.nil?

    # Loaded lazily: the gem is declared with `require: false` in plugin.rb.
    require "pdf/reader"

    PDF::Reader.open(pdf_path) do |reader|
      # Pages are numbered from 1 here; llm_decorate converts to 0-based.
      reader.pages.each.with_index(1) do |page, page_number|
        llm_decorate(page_number: page_number, text: page.text, pdf_path: pdf_path) do |chunk|
          yield chunk
        end
      end
    end
  end

  # Yields the text for a single page.
  #
  # Without an llm_model the given +text+ is yielded unchanged. Otherwise the
  # page is converted to a PNG, uploaded, and run through ImageToText with
  # +text+ as guidance; every chunk ImageToText produces is yielded.
  #
  # @param page_number [Integer] 1-based page number
  # @param text [String] text pdf-reader extracted for the page
  # @param pdf_path [String] path of the PDF on disk
  def llm_decorate(page_number:, text:, pdf_path:)
    raise "Must be called with block" if !block_given?

    if !@llm_model
      yield text
      return
    end

    temp_dir = Dir.mktmpdir("discourse-pdf-#{SecureRandom.hex(8)}")
    output_path = File.join(temp_dir, "page-#{page_number}.png")

    # Extract specific page using ImageMagick
    # image magick uses 0 based page numbers
    command = [
      "magick",
      "-density",
      "300",
      "#{pdf_path}[#{page_number - 1}]",
      "-background",
      "white",
      "-auto-orient",
      "-quality",
      "85",
      output_path,
    ]

    Discourse::Utils.execute_command(
      *command,
      failure_message: "Failed to convert PDF page #{page_number} to image",
      timeout: 30,
    )

    # TODO - we are creating leftover uploads, they will be cleaned up
    # but maybe we should just keep them around?
    # Named page_upload so it does not shadow the `upload` reader (the PDF).
    page_upload =
      UploadCreator.new(File.open(output_path), "page-#{page_number}.png").create_for(@user&.id)

    DiscourseAi::Utils::ImageToText
      .new(upload: page_upload, llm_model: @llm_model, user: @user, guidance_text: text)
      .extract_text { |chunk| yield chunk }
  ensure
    # temp_dir is nil when we returned before creating it.
    FileUtils.rm_rf(temp_dir) if temp_dir
  end
end
|
10
plugin.rb
10
plugin.rb
|
@ -12,6 +12,16 @@ gem "tokenizers", "0.4.4"
|
||||||
gem "tiktoken_ruby", "0.0.9"
|
gem "tiktoken_ruby", "0.0.9"
|
||||||
gem "ed25519", "1.2.4" #TODO remove this as existing ssl gem should handle this
|
gem "ed25519", "1.2.4" #TODO remove this as existing ssl gem should handle this
|
||||||
|
|
||||||
|
# we probably want to move all dependencies directly in to the Discourse Gemfile, this
|
||||||
|
# will give us a strong guarantee that the dependencies are compatible and keep getting upgraded
|
||||||
|
gem "Ascii85", "2.0.1", require: false
|
||||||
|
gem "ruby-rc4", "0.1.5", require: false
|
||||||
|
gem "hashery", "2.1.2", require: false
|
||||||
|
gem "ttfunk", "1.8.0", require: false
|
||||||
|
gem "afm", "0.2.2", require: false
|
||||||
|
# all above are required by pdf-reader
|
||||||
|
gem "pdf-reader", "2.14.1", require: false
|
||||||
|
|
||||||
enabled_site_setting :discourse_ai_enabled
|
enabled_site_setting :discourse_ai_enabled
|
||||||
|
|
||||||
register_asset "stylesheets/common/streaming.scss"
|
register_asset "stylesheets/common/streaming.scss"
|
||||||
|
|
Binary file not shown.
|
@ -3,7 +3,7 @@
|
||||||
RSpec.describe Jobs::DigestRagUpload do
|
RSpec.describe Jobs::DigestRagUpload do
|
||||||
fab!(:persona) { Fabricate(:ai_persona) }
|
fab!(:persona) { Fabricate(:ai_persona) }
|
||||||
fab!(:upload) { Fabricate(:upload, extension: "txt") }
|
fab!(:upload) { Fabricate(:upload, extension: "txt") }
|
||||||
fab!(:pdf_upload) { Fabricate(:upload, extension: "pdf") }
|
fab!(:image_upload) { Fabricate(:upload, extension: "png") }
|
||||||
let(:document_file) { StringIO.new("some text" * 200) }
|
let(:document_file) { StringIO.new("some text" * 200) }
|
||||||
|
|
||||||
fab!(:cloudflare_embedding_def)
|
fab!(:cloudflare_embedding_def)
|
||||||
|
@ -31,13 +31,13 @@ RSpec.describe Jobs::DigestRagUpload do
|
||||||
end
|
end
|
||||||
|
|
||||||
describe "#execute" do
|
describe "#execute" do
|
||||||
context "when processing a PDF upload" do
|
context "when processing an image upload" do
|
||||||
it "will reject the indexing if the site setting is not enabled" do
|
it "will reject the indexing if the site setting is not enabled" do
|
||||||
SiteSetting.ai_rag_pdf_images_enabled = false
|
SiteSetting.ai_rag_images_enabled = false
|
||||||
|
|
||||||
expect {
|
expect {
|
||||||
described_class.new.execute(
|
described_class.new.execute(
|
||||||
upload_id: pdf_upload.id,
|
upload_id: image_upload.id,
|
||||||
target_id: persona.id,
|
target_id: persona.id,
|
||||||
target_type: persona.class.to_s,
|
target_type: persona.class.to_s,
|
||||||
)
|
)
|
||||||
|
|
|
@ -0,0 +1,62 @@
|
||||||
|
# frozen_string_literal: true
|
||||||
|
|
||||||
|
RSpec.describe DiscourseAi::Utils::PdfToText do
  fab!(:llm_model)
  fab!(:user)

  let(:pdf) { plugin_file_from_fixtures("2-page.pdf", "rag") }
  let(:upload) { UploadCreator.new(pdf, "2-page.pdf").create_for(Discourse.system_user.id) }

  before { SiteSetting.authorized_extensions = "pdf|png|jpg|jpeg" }

  describe "#extract_text" do
    it "extracts text from PDF pages" do
      pdf_to_text = described_class.new(upload: upload)
      pages = []
      pdf_to_text.extract_text { |page| pages << page }

      expect(pages).to eq(["Page 1", "Page 2"])
    end
  end

  context "when improving PDF extraction with LLM" do
    # Canned LLM replies, one per page of the two-page fixture.
    let(:responses) do
      [
        "<chunk>Page 1: LLM chunk 1</chunk><chunk>Page 1: LLM chunk 2</chunk>",
        "<chunk>Page 2: LLM chunk 3</chunk>",
      ]
    end

    let(:expected_chunks) { ["Page 1: LLM chunk 1", "Page 1: LLM chunk 2", "Page 2: LLM chunk 3"] }

    # Both examples render PDF pages to images, so they share one skip guard.
    before do
      if ENV["CI"]
        skip "This test requires imagemagick is installed with ghostscript support - which is not available in CI"
      end
    end

    it "can properly simulate a file" do
      pages = []
      DiscourseAi::Completions::Llm.with_prepared_responses(responses) do |_, _, _prompts|
        file = described_class.as_fake_file(upload: upload, user: user, llm_model: llm_model)

        while content = file.read(100_000)
          pages << content
        end
      end

      expect(pages).to eq(expected_chunks)
    end

    it "works as expected" do
      pdf_to_text = described_class.new(upload: upload, user: user, llm_model: llm_model)
      pages = []

      DiscourseAi::Completions::Llm.with_prepared_responses(responses) do |_, _, _prompts|
        pdf_to_text.extract_text { |page| pages << page }
      end

      expect(pages).to eq(expected_chunks)
    end
  end
end
|
|
@ -24,10 +24,10 @@ RSpec.describe DiscourseAi::Admin::RagDocumentFragmentsController do
|
||||||
end
|
end
|
||||||
|
|
||||||
describe "POST #upload_file" do
|
describe "POST #upload_file" do
|
||||||
let :fake_pdf do
|
let :fake_image do
|
||||||
@cleanup_files ||= []
|
@cleanup_files ||= []
|
||||||
tempfile = Tempfile.new(%w[test .pdf])
|
tempfile = Tempfile.new(%w[test .png])
|
||||||
tempfile.write("fake pdf")
|
tempfile.write("fake image")
|
||||||
tempfile.rewind
|
tempfile.rewind
|
||||||
@cleanup_files << tempfile
|
@cleanup_files << tempfile
|
||||||
tempfile
|
tempfile
|
||||||
|
@ -46,26 +46,26 @@ RSpec.describe DiscourseAi::Admin::RagDocumentFragmentsController do
|
||||||
end
|
end
|
||||||
|
|
||||||
# The upload under test is a .png, gated by ai_rag_images_enabled.
it "rejects image files if site setting is not enabled" do
  SiteSetting.ai_rag_images_enabled = false

  post "/admin/plugins/discourse-ai/rag-document-fragments/files/upload.json",
       params: {
         file: Rack::Test::UploadedFile.new(fake_image),
       }

  expect(response.status).to eq(400)
end
|
||||||
|
|
||||||
it "allows PDF files if site setting is enabled" do
|
it "allows image files if site setting is enabled" do
|
||||||
SiteSetting.ai_rag_pdf_images_enabled = true
|
SiteSetting.ai_rag_images_enabled = true
|
||||||
|
|
||||||
post "/admin/plugins/discourse-ai/rag-document-fragments/files/upload.json",
|
post "/admin/plugins/discourse-ai/rag-document-fragments/files/upload.json",
|
||||||
params: {
|
params: {
|
||||||
file: Rack::Test::UploadedFile.new(fake_pdf),
|
file: Rack::Test::UploadedFile.new(fake_image),
|
||||||
}
|
}
|
||||||
|
|
||||||
upload = Upload.last
|
upload = Upload.last
|
||||||
expect(upload.original_filename).to end_with(".pdf")
|
expect(upload.original_filename).to end_with(".png")
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue