FEATURE: GPT-4 turbo vision support (#575)

The recent release of GPT-4 Turbo adds vision support. This commit adds the pipeline for sending images to OpenAI.
2024-04-11 16:22:59 +10:00 · 2024-04-11 16:22:59 +10:00 · 23d12c8927
parent a77658e2b1
commit 23d12c8927
2 changed files with 77 additions and 0 deletions
--- a/lib/completions/dialects/chat_gpt.rb
+++ b/lib/completions/dialects/chat_gpt.rb
@ -65,6 +65,7 @@ module DiscourseAi
                  user_message[:name] = msg[:id]
                end
              end
              user_message[:content] = inline_images(user_message[:content], msg)
              user_message
            end
          end
@ -106,6 +107,30 @@ module DiscourseAi
        private
        # Builds the `content` value for a user message, attaching any images
        # uploaded with the message when the current model accepts image input.
        #
        # content - the message text as already prepared by the caller.
        # message - the prompt message; passed to `prompt.encoded_uploads` to
        #           look up its encoded uploads.
        #
        # Returns `content` unchanged when the model is not one of the
        # vision-capable models checked below, or when the message has no
        # encoded uploads. Otherwise returns an array with one "image_url" part
        # per upload (as a base64 data URL) followed by a single "text" part.
        def inline_images(content, message)
          vision_model = model_name.include?("gpt-4-vision") || model_name == "gpt-4-turbo"
          return content unless vision_model

          encoded_uploads = prompt.encoded_uploads(message)
          return content unless encoded_uploads.present?

          parts =
            encoded_uploads.map do |details|
              {
                type: "image_url",
                image_url: {
                  url: "data:#{details[:mime_type]};base64,#{details[:base64]}",
                },
              }
            end

          # Use the text handed in by the caller rather than re-reading
          # message[:content], so any formatting the caller applied is kept.
          parts << { type: "text", text: content }
        end
        def per_message_overhead
          # open ai defines about 4 tokens per message of overhead
          4
--- a/spec/lib/completions/endpoints/open_ai_spec.rb
+++ b/spec/lib/completions/endpoints/open_ai_spec.rb
@ -165,6 +165,58 @@ RSpec.describe DiscourseAi::Completions::Endpoints::OpenAi do
    EndpointsCompliance.new(self, endpoint, DiscourseAi::Completions::Dialects::ChatGpt, user)
  end
  # Fixture image file obtained via plugin_file_from_fixtures.
  let(:image100x100) do
    plugin_file_from_fixtures("100x100.jpg")
  end

  # Upload built from the fixture image and created for the system user's id.
  let(:upload100x100) do
    creator = UploadCreator.new(image100x100, "image.jpg")
    creator.create_for(Discourse.system_user.id)
  end
  describe "image support" do
    # A user message with an upload is expected to reach the chat completions
    # endpoint as a content array: the image as a base64 data URL, then the text.
    it "can handle images" do
      llm = DiscourseAi::Completions::Llm.proxy("open_ai:gpt-4-turbo")

      user_turn = { type: :user, id: "user1", content: "hello", upload_ids: [upload100x100.id] }
      prompt = DiscourseAi::Completions::Prompt.new("You are image bot", messages: [user_turn])
      image = prompt.encoded_uploads(prompt.messages.last).first

      # The body matcher always returns true; it only records the parsed
      # request body so it can be asserted on after the call.
      sent_payload = nil
      capture_payload =
        proc do |raw_body|
          sent_payload = JSON.parse(raw_body, symbolize_names: true)
          true
        end
      canned_reply = { choices: [{ message: { content: "nice pic" } }] }.to_json

      stub_request(:post, "https://api.openai.com/v1/chat/completions").with(
        body: capture_payload,
      ).to_return(status: 200, body: canned_reply)

      reply = llm.generate(prompt, user: user)
      expect(reply).to eq("nice pic")

      image_part = {
        type: "image_url",
        image_url: {
          url: "data:#{image[:mime_type]};base64,#{image[:base64]}",
        },
      }
      text_part = { type: "text", text: "hello" }
      expected_payload = {
        model: "gpt-4-turbo",
        messages: [
          { role: "system", content: "You are image bot" },
          { role: "user", content: [image_part, text_part], name: "user1" },
        ],
      }

      expect(sent_payload).to eq(expected_payload)
    end
  end
  describe "#perform_completion!" do
    context "when using regular mode" do
      context "with simple prompts" do