first commit

This commit is contained in:
Zhuohan Li 2023-06-21 23:36:19 +08:00
commit 6cd15ede01
50 changed files with 1319 additions and 0 deletions

11
.editorconfig Normal file
View File

@ -0,0 +1,11 @@
# editorconfig.org
root = true
[*]
charset = utf-8
end_of_line = lf
indent_size = 2
indent_style = space
insert_final_newline = true
trim_trailing_whitespace = true

29
.github/release-drafter.yml vendored Normal file
View File

@ -0,0 +1,29 @@
name-template: 'v$NEXT_MINOR_VERSION'
tag-template: 'v$NEXT_MINOR_VERSION'
prerelease: true
exclude-labels:
- 'skip-changelog'
categories:
- title: '🚀 Features'
labels:
- 'new-feature'
- 'feature'
- 'enhancement'
- title: '🐛 Bug fixes'
labels:
- 'fix'
- 'bugfix'
- 'bug'
- title: '📖 Docs'
labels:
- 'docs'
- title: '📦 Dependencies'
labels:
- 'dependencies'
- title: '🧰 Maintenance'
label: 'chore'
change-template: '- #$NUMBER: $TITLE'
template: |
## Changes
$CHANGES

14
.github/workflows/release-notes.yml vendored Normal file
View File

@ -0,0 +1,14 @@
name: Release notes
on:
push:
branches:
- master
jobs:
update_release_draft:
runs-on: ubuntu-latest
steps:
- uses: release-drafter/release-drafter@v5
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

43
.gitignore vendored Normal file
View File

@ -0,0 +1,43 @@
# Ignore docs files
_gh_pages
_site
.ruby-version
.sass-cache
.jekyll-cache
# Numerous always-ignore extensions
*.diff
*.err
*.orig
*.log
*.rej
*.swo
*.swp
*.zip
*.vi
*~
# OS or Editor folders
.DS_Store
._*
Thumbs.db
.cache
.project
.settings
.tmproj
*.esproj
nbproject
*.sublime-project
*.sublime-workspace
.idea
# Komodo
*.komodoproject
.komodotools
# grunt-html-validation
validation-status.json
validation-report.json
# Folders to ignore
node_modules

BIN
.jekyll-metadata (1) Normal file

Binary file not shown.

10
404.html Normal file
View File

@ -0,0 +1,10 @@
---
layout: default
title: "404: Page not found"
permalink: 404.html
---
<div class="page">
<h1 class="page-title">404: Page not found</h1>
<p class="lead">Sorry, we've misplaced that URL or it's pointing to something that doesn't exist. <a href="{{ site.baseurl }}/">Head back home</a> to try finding it again.</p>
</div>

6
Gemfile Normal file
View File

@ -0,0 +1,6 @@
source "https://rubygems.org"
gem "jekyll"
gem "jekyll-gist"
gem "jekyll-paginate"
gem "jekyll-seo-tag"

89
Gemfile.lock Normal file
View File

@ -0,0 +1,89 @@
GEM
remote: https://rubygems.org/
specs:
addressable (2.8.4)
public_suffix (>= 2.0.2, < 6.0)
colorator (1.1.0)
concurrent-ruby (1.2.2)
em-websocket (0.5.3)
eventmachine (>= 0.12.9)
http_parser.rb (~> 0)
eventmachine (1.2.7)
faraday (2.7.7)
faraday-net_http (>= 2.0, < 3.1)
ruby2_keywords (>= 0.0.4)
faraday-net_http (3.0.2)
ffi (1.15.5)
forwardable-extended (2.6.0)
google-protobuf (3.23.3-x86_64-darwin)
http_parser.rb (0.8.0)
i18n (1.14.1)
concurrent-ruby (~> 1.0)
jekyll (4.3.2)
addressable (~> 2.4)
colorator (~> 1.0)
em-websocket (~> 0.5)
i18n (~> 1.0)
jekyll-sass-converter (>= 2.0, < 4.0)
jekyll-watch (~> 2.0)
kramdown (~> 2.3, >= 2.3.1)
kramdown-parser-gfm (~> 1.0)
liquid (~> 4.0)
mercenary (>= 0.3.6, < 0.5)
pathutil (~> 0.9)
rouge (>= 3.0, < 5.0)
safe_yaml (~> 1.0)
terminal-table (>= 1.8, < 4.0)
webrick (~> 1.7)
jekyll-gist (1.5.0)
octokit (~> 4.2)
jekyll-paginate (1.1.0)
jekyll-sass-converter (3.0.0)
sass-embedded (~> 1.54)
jekyll-seo-tag (2.8.0)
jekyll (>= 3.8, < 5.0)
jekyll-watch (2.2.1)
listen (~> 3.0)
kramdown (2.4.0)
rexml
kramdown-parser-gfm (1.1.0)
kramdown (~> 2.0)
liquid (4.0.4)
listen (3.8.0)
rb-fsevent (~> 0.10, >= 0.10.3)
rb-inotify (~> 0.9, >= 0.9.10)
mercenary (0.4.0)
octokit (4.25.1)
faraday (>= 1, < 3)
sawyer (~> 0.9)
pathutil (0.16.2)
forwardable-extended (~> 2.6)
public_suffix (5.0.1)
rb-fsevent (0.11.2)
rb-inotify (0.10.1)
ffi (~> 1.0)
rexml (3.2.5)
rouge (4.1.2)
ruby2_keywords (0.0.5)
safe_yaml (1.0.5)
sass-embedded (1.63.4-x86_64-darwin)
google-protobuf (~> 3.23)
sawyer (0.9.2)
addressable (>= 2.3.5)
faraday (>= 0.17.3, < 3)
terminal-table (3.0.2)
unicode-display_width (>= 1.1.1, < 3)
unicode-display_width (2.4.2)
webrick (1.8.1)
PLATFORMS
x86_64-darwin-22
DEPENDENCIES
jekyll
jekyll-gist
jekyll-paginate
jekyll-seo-tag
BUNDLED WITH
2.4.14

9
LICENSE.md Normal file
View File

@ -0,0 +1,9 @@
# Released under MIT License
Copyright (c) 2013 Mark Otto.
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

92
README.md Normal file
View File

@ -0,0 +1,92 @@
# Poole
*The Strange Case of Dr. Jekyll and Mr. Hyde* tells the story of a lawyer investigating the connection of two persons, Dr. Henry Jekyll and Mr. Edward Hyde. Chief among the novel's supporting cast is a man by the name of Mr. Poole, Dr. Jekyll's loyal butler.
-----
Poole is the butler for [Jekyll](http://jekyllrb.com), the static site generator. It's designed and developed by [@mdo](https://twitter.com/mdo) to provide a clear and concise foundational setup for any Jekyll site. It does so by furnishing a full vanilla Jekyll install with example templates, pages, posts, and styles.
![Poole](https://f.cloud.github.com/assets/98681/1834359/71ae4048-73db-11e3-9a3c-df38eb170537.png)
See Poole in action with [the demo site](https://demo.getpoole.com).
There are currently two official themes built on Poole:
* [Hyde](https://hyde.getpoole.com)
* [Lanyon](https://lanyon.getpoole.com)
Individual theme feedback and bug reports should be submitted to the theme's individual repository.
## Contents
- [Usage](#usage)
- [Development](#development)
- [Author](#author)
- [License](#license)
## Usage
### 1. Install dependencies
Poole is built on Jekyll and uses its built-in SCSS compiler to generate our CSS. Before getting started, you'll need to install the Jekyll gem and related dependencies:
```bash
$ gem install jekyll jekyll-gist jekyll-paginate jekyll-seo-tag
```
**Windows users:** Windows users have a bit more work to do, but luckily [@juthilo](https://github.com/juthilo) has your back with his [Run Jekyll on Windows](https://github.com/juthilo/run-jekyll-on-windows) guide.
**Need syntax highlighting?** Poole includes support for Pygments or Rouge, so install your gem of choice to make use of the built-in styling. Read more about this in the [Jekyll docs](https://jekyllrb.com/docs/liquid/tags/#code-snippet-highlighting).
### 2a. Quick start
To help anyone with any level of familiarity with Jekyll quickly get started, Poole includes everything you need for a basic Jekyll site. To that end, just download Poole and start up Jekyll.
### 2b. Roll your own Jekyll site
Folks wishing to use Jekyll's templates and styles can do so with a little bit of manual labor. Download Poole and then copy what you need (likely `_layouts/`, `*.html` files, `atom.xml` for RSS, and `assets/` for CSS, JS, etc.).
### 3. Running locally
To see your Jekyll site with Poole applied, start a Jekyll server. In Terminal, from `/poole` (or whatever your Jekyll site's root directory is named):
```bash
$ jekyll serve
```
Open <http://localhost:4000> in your browser, and voilà.
### 4. Serving it up
If you host your code on GitHub, you can use [GitHub Pages](https://pages.github.com) to host your project.
1. Fork this repo and switch to the `gh-pages` branch.
1. If you're [using a custom domain name](https://help.github.com/articles/setting-up-a-custom-domain-with-github-pages), modify the `CNAME` file to point to your new domain.
2. If you're not using a custom domain name, **modify the `baseurl` in `_config.yml`** to point to your GitHub Pages URL. Example: for a repo at `github.com/username/poole`, use `http://username.github.io/poole/`. **Be sure to include the trailing slash.**
3. Done! Head to your GitHub Pages URL or custom domain.
No matter your production or hosting setup, be sure to verify the `baseurl` option file and `CNAME` settings. Not applying this correctly can mean broken styles on your site.
## Development
Poole has two branches, but only one is used for active development.
- `master` for development. **All pull requests should be submitted against `master`.**
- `gh-pages` for our hosted site, which includes our analytics tracking code. **Please avoid using this branch.**
CSS is handled via Jekyll's built-in Sass compiler. Source Sass files are located in `_sass/`, included into `styles.scss`, and compile to `styles.css`.
## Author
**Mark Otto**
- <https://github.com/mdo>
- <https://twitter.com/mdo>
## License
Open sourced under the [MIT license](LICENSE.md).
<3

41
_config.yml Normal file
View File

@ -0,0 +1,41 @@
# Setup
title: ""
tagline: ""
url: https://vllm.ai
paginate: 1
baseurl: ""
permalink: pretty
# Gems
plugins:
- jekyll-gist
- jekyll-paginate
- jekyll-seo-tag
# Optimize Jekyll
exclude:
- .editorconfig
- .git
- .jekyll-cache
- Gemfile
- Gemfile.lock
- LICENSE.md
- README.md
sass:
sass_dir: _sass
style: :compressed
# Options
# Replace this value and uncomment to enable Google Analytics tracking
# ga_analytics: UA-000000-0
# Specify the author for blog posts
author:
name: vLLM Team
url: https://github.com/vllm-project/vllm
email: vllm.proj@gmail.com
# Custom vars
version: 3.0.0

5
_includes/figures.html Normal file
View File

@ -0,0 +1,5 @@
<div id="html" markdown="0">
<p align="center" markdown="0">
<img alt="vLLM" src="assets/figures/perf_a100_n1_light.png" width=55%>
</p>
</div>

17
_includes/head.html Normal file
View File

@ -0,0 +1,17 @@
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta name="twitter:image" content="https://vllm.ai/assets/logos/vllm-logo-text-light.png">
<meta name="twitter:image:alt" content="vLLM">
<title>
{{ page.title }}
</title>
<link rel="stylesheet" href="{{ 'styles.css' | relative_url }}">
<link rel="apple-touch-icon-precomposed" sizes="144x144" href="{{ '/assets/apple-touch-icon-precomposed.png' | relative_url }}">
<link rel="shortcut icon" href="{{ '/assets/favicon.ico' | relative_url }}">
<link rel="alternate" type="application/atom+xml" title="{{ site.title }}" href="{{ 'atom.xml' | relative_url }}">
{% seo title=false %}
</head>

33
_layouts/default.html Normal file
View File

@ -0,0 +1,33 @@
<!doctype html>
<html lang="en">
{% include head.html %}
<body>
<div class="container content">
<br>
<br>
<main>
{{ content }}
</main>
<footer class="footer">
<small>
&copy; <time datetime="{{ site.time | date_to_xmlschema }}">{{ site.time | date: '%Y' }}</time>. vLLM Team. All rights reserved.
</small>
</footer>
</div>
{% if site.ga_analytics %}
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
ga('create', '{{ site.ga_analytics }}', 'auto');
ga('send', 'pageview');
</script>
{% endif %}
</body>
</html>

8
_layouts/page.html Normal file
View File

@ -0,0 +1,8 @@
---
layout: default
---
<article class="page">
<h1 class="page-title">{{ page.title }}</h1>
{{ content }}
</article>

25
_layouts/post.html Normal file
View File

@ -0,0 +1,25 @@
---
layout: default
---
<article class="post">
<h1 class="post-title">{{ page.title }}</h1>
<time datetime="{{ page.date | date_to_xmlschema }}" class="post-date">{{ page.date | date_to_string }}</time>
{{ content }}
</article>
{% if site.related_posts != empty %}
<aside class="related">
<h3>Related posts</h3>
<ul class="related-posts">
{% for post in site.related_posts limit:3 %}
<li>
<a href="{{ site.baseurl }}{{ post.url }}">
{{ post.title }}
<small><time datetime="{{ post.date | date_to_xmlschema }}">{{ post.date | date_to_string }}</time></small>
</a>
</li>
{% endfor %}
</ul>
</aside>
{% endif %}

View File

@ -0,0 +1,161 @@
---
layout: post
title: "vLLM: Easy, Fast, and Cheap LLM Serving with PagedAttention"
---
<p align="center">
<picture>
<img src="assets/logos/vllm-logo-text-light.png" width="65%">
</picture>
</p>
# *vLLM:* Easy, Fast, and Cheap LLM Serving with PagedAttention
*By Woosuk Kwon\*, Zhuohan Li\*, Siyuan Zhuang, Ying Sheng, Lianmin Zheng, Cody Yu, Joey Gonzalez, Hao Zhang, and Ion Stoica (\* Equal Contribution). June 20th, 2023*
<p align="left">
<a href="https://github.com/vllm-project/vllm"><b>GitHub</b></a> | <a href="https://vllm.readthedocs.io/en/latest/"><b>Documentation</b></a> | <b>Paper (Stay Tuned)</b>
</p>
---
LLMs promise to fundamentally change how we use AI across all industries. However, actually serving these models is challenging and can be surprisingly slow even on expensive hardware. Today we are excited to introduce vLLM, an open-source library for fast LLM inference and serving. vLLM utilizes **PagedAttention**, our new attention algorithm that effectively manages attention keys and values. vLLM equipped with PagedAttention redefines the new state of the art in LLM serving: it delivers up to 24x higher throughput than HuggingFace Transformers, without requiring any model architecture changes.
vLLM has been developed at UC Berkeley and deployed at [Chatbot Arena and Vicuna Demo](https://chat.lmsys.org) for the past two months. It is the core technology that makes LLM serving affordable even for a small research team like LMSYS with limited compute resources. Try out vLLM now with a single command at our [GitHub repository](https://github.com/vllm-project/vllm).
## Beyond State-of-the-art Performance
We compare the throughput of vLLM with [HuggingFace Transformers (HF)](https://huggingface.co/docs/transformers/main_classes/text_generation), the most popular LLM library, and [HuggingFace Text Generation Inference (TGI)](https://github.com/huggingface/text-generation-inference), the previous state of the art. We evaluate in two settings: LLaMA-7B on an NVIDIA A10G GPU and LLaMA-13B on an NVIDIA A100 GPU (40GB). We sample the requests' input/output lengths from the ShareGPT dataset. In our experiments, vLLM achieves up to **24x** higher throughput compared to HF and up to **3.5x** higher throughput than TGI.
<p align="center">
<picture>
<img src="assets/figures/perf_a100_n1_light.png" width="45%">
</picture><picture>
<img src="assets/figures/perf_a10g_n1_light.png" width="45%">
</picture><br>
Serving throughput when each request asks for <em>one output completion</em>. vLLM achieves 14x - 24x higher throughput than HF and 2.2x - 2.5x higher throughput than TGI.
</p>
<p align="center">
<picture>
<img src="assets/figures/perf_a100_n3_light.png" width="45%">
</picture><picture>
<img src="assets/figures/perf_a10g_n3_light.png" width="45%">
</picture>
<br>Serving throughput when each request asks for <em>three parallel output completions</em>. vLLM achieves 8.5x - 15x higher throughput than HF and 3.3x - 3.5x higher throughput than TGI.
</p>
## The Secret Sauce: PagedAttention
In vLLM, we identify that the performance of LLM serving is bottlenecked by memory. In the autoregressive decoding process, all the input tokens to the LLM produce their attention key and value tensors, and these tensors are kept in GPU memory to generate next tokens. These cached key and value tensors are often referred to as KV cache. The KV cache is
- *Large:* Takes up to 1.7GB for a single sequence in LLaMA-13B.
- *Dynamic:* Its size depends on the sequence length, which is highly variable and unpredictable.
As a result, efficiently managing the KV cache presents a significant challenge. We find that existing systems waste **60% – 80%** of memory due to fragmentation and over-reservation.
To address this problem, we introduce **PagedAttention**, an attention algorithm inspired by the classic idea of virtual memory and paging in operating systems. Unlike the traditional attention algorithms, PagedAttention allows storing continuous keys and values in non-contiguous memory space. Specifically, PagedAttention partitions the KV cache of each sequence into blocks, each block containing the keys and values for a fixed number of tokens. During the attention computation, the PagedAttention kernel identifies and fetches these blocks efficiently.
<p align="center">
<picture>
<img src="assets/figures/annimation0.gif" width="80%">
</picture>
<br>
<em>PagedAttention:</em> The KV cache is partitioned into blocks. Blocks do not need to be contiguous in memory space.
</p>
Because the blocks do not need to be contiguous in memory, we can manage the keys and values in a more flexible way, as in an OS's virtual memory: one can think of blocks as pages, tokens as bytes, and sequences as processes. The contiguous *logical blocks* of a sequence are mapped to non-contiguous *physical blocks* via a block table. The physical blocks are allocated on demand as new tokens are generated.
<p align="center">
<picture>
<img src="assets/figures/annimation1.gif" width="100%">
</picture>
<br>
Example generation process for a request with PagedAttention.
</p>
In PagedAttention, memory waste only happens in the last block of a sequence. In practice, this results in near-optimal memory usage, with a mere waste of under 4%. This boost in memory efficiency proves highly beneficial: It allows the system to batch more sequences together, increase GPU utilization, and thereby significantly increase the throughput as shown in the performance result above.
PagedAttention has another key advantage: efficient memory sharing. For example, in *parallel sampling*, multiple output sequences are generated from the same prompt. In this case, the computation and memory for the prompt can be shared between the output sequences.
<p align="center">
<picture>
<img src="assets/figures/annimation2.gif" width="80%">
</picture>
<br>
Example of parallel sampling.
</p>
PagedAttention naturally enables memory sharing through its block table. Similar to how processes share physical pages, different sequences in PagedAttention can share the blocks by mapping their logical blocks to the same physical block. To ensure safe sharing, PagedAttention keeps track of the reference counts of the physical blocks and implements the *Copy-on-Write* mechanism.
<p align="center">
<picture>
<img src="assets/figures/annimation3.gif" width="100%">
</picture>
<br>
Example generation process for a request that samples multiple outputs.
</p>
PagedAttention's memory sharing greatly reduces the memory overhead of complex sampling algorithms, such as parallel sampling and beam search, cutting their memory usage by up to 55%. This can translate into up to 2.2x improvement in throughput. This makes such sampling methods practical in LLM services.
PagedAttention is the core technology behind vLLM, our LLM inference and serving engine that supports a variety of models with high performance and an easy-to-use interface. For more technical details about vLLM and PagedAttention, check out our [GitHub repo](https://github.com/vllm-project/vllm) and stay tuned for our paper.
## The Silent Hero Behind LMSYS Vicuna and Chatbot Arena
This April, [LMSYS](https://lmsys.org) developed the popular Vicuna chatbot models and made them publicly available. Since then, Vicuna has been served in [Chatbot Arena](https://arena.lmsys.org/) for millions of users. Initially, LMSYS FastChat adopted a HF Transformers based [serving backend](https://github.com/lm-sys/FastChat/blob/main/fastchat/serve/model_worker.py) to serve the chat demo. As the demo became more popular, the peak traffic ramped up several times, making the HF backend a significant bottleneck. The LMSYS and vLLM team have worked together and soon developed the FastChat-vLLM integration to use vLLM [as the new backend](https://github.com/lm-sys/FastChat/blob/main/fastchat/serve/vllm_worker.py) in order to support the growing demands (up to 5x more traffic). In an early [internal micro-benchmark](https://github.com/lm-sys/FastChat/blob/main/fastchat/serve/test_throughput.py) by LMSYS, the vLLM serving backend can **achieve up to 30x higher throughput than an initial HF backend.**
Since mid-April, the most popular models such as Vicuna, Koala, and LLaMA, have all been successfully served using the FastChat-vLLM integration. With FastChat as the multi-model chat serving frontend and vLLM as the inference backend, LMSYS is able to harness a limited number of university-sponsored GPUs to serve Vicuna to millions of users with *high throughput* and *low latency*. LMSYS is expanding the use of vLLM to a wider range of models, including Databricks Dolly, LAION's OpenAssistant, and Stability AI's StableLM. The [support for more models](https://vllm.readthedocs.io/en/latest/models/supported_models.html) is being developed and forthcoming.
<p align="center">
<picture>
<img src="assets/figures/lmsys_traffic.png" width="100%">
</picture>
<br>
Requests served by FastChat-vLLM integration in the Chatbot Arena from April to May. Indeed, more than half of the requests to Chatbot Arena use vLLM as the inference backend.
</p>
This utilization of vLLM has also significantly reduced operational costs. With vLLM, LMSYS was able to cut the number of GPUs used for serving the above traffic by 50%. vLLM has been handling an average of 30K requests daily and a peak of 60K, which is a clear demonstration of vLLM's robustness.
## Get started with vLLM
Install vLLM with the following command (check out our [installation guide](https://vllm.readthedocs.io/en/latest/getting_started/installation.html) for more):
```bash
$ pip install vllm
```
vLLM can be used for both offline inference and online serving. To use vLLM for offline inference, you can import vLLM and use the `LLM` class in your Python scripts:
```python
from vllm import LLM
prompts = ["Hello, my name is", "The capital of France is"] # Sample prompts.
llm = LLM(model="lmsys/vicuna-7b-v1.3") # Create an LLM.
outputs = llm.generate(prompts) # Generate texts from the prompts.
```
To use vLLM for online serving, you can start an OpenAI API-compatible server via:
```bash
$ python -m vllm.entrypoints.openai.api_server --model lmsys/vicuna-7b-v1.3
```
You can query the server with the same format as OpenAI API:
```bash
$ curl http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "lmsys/vicuna-7b-v1.3",
"prompt": "San Francisco is a",
"max_tokens": 7,
"temperature": 0
}'
```
For more ways to use vLLM, please check out the [quickstart guide](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html).
<br>
-----
*Blog written by Woosuk Kwon and Zhuohan Li (UC Berkeley). Special thanks to Hao Zhang for the integration of vLLM and FastChat and for writing the corresponding section. We thank the entire team — Siyuan Zhuang, Ying Sheng, Lianmin Zheng (UC Berkeley), Cody Yu (Independent Researcher), Joey Gonzalez (UC Berkeley), Hao Zhang (UC Berkeley & UCSD), and Ion Stoica (UC Berkeley).*

70
_sass/_base.scss Normal file
View File

@ -0,0 +1,70 @@
// Body resets
//
// Update the foundational and global aspects of the page.
* {
box-sizing: border-box;
}
body {
margin: 0;
font-family: var(--body-font);
font-size: var(--body-font-size);
line-height: var(--body-line-height);
color: var(--body-color);
background-color: var(--body-bg);
-webkit-text-size-adjust: 100%;
-ms-text-size-adjust: 100%;
}
// No `:visited` state is required by default (browsers will use `a`)
a {
color: var(--link-color);
// `:focus` is linked to `:hover` for basic accessibility
&:hover,
&:focus {
color: var(--link-hover-color);
}
strong {
color: inherit;
}
}
img {
// display: block;
max-width: 100%;
margin-bottom: var(--spacer);
border-radius: var(--border-radius);
}
table {
margin-bottom: 1rem;
width: 100%;
border: 0 solid var(--border-color);
border-collapse: collapse;
}
td,
th {
padding: .25rem .5rem;
border-color: inherit;
border-style: solid;
border-width: 0;
border-bottom-width: 1px;
}
th {
text-align: left;
}
thead th {
border-bottom-color: currentColor;
}
mark {
padding: .15rem;
background-color: var(--yellow-100);
border-radius: .125rem;
}

58
_sass/_code.scss Normal file
View File

@ -0,0 +1,58 @@
// Code
//
// Inline and block-level code snippets. Includes tweaks to syntax highlighted
// snippets from Pygments/Rouge and Gist embeds.
code,
pre {
font-family: var(--code-font);
}
code {
font-size: 85%;
}
pre {
display: block;
margin-top: 0;
margin-bottom: var(--spacer-3);
overflow: auto;
}
.highlight {
padding: var(--spacer);
margin-bottom: var(--spacer);
background-color: var(--code-bg);
border-radius: var(--border-radius);
pre {
margin-bottom: 0;
}
// Triple backticks (code fencing) doubles the .highlight elements
.highlight {
padding: 0;
}
}
.rouge-table {
margin-bottom: 0;
font-size: 100%;
&,
td,
th {
border: 0;
}
.gutter {
vertical-align: top;
user-select: none;
opacity: .25;
}
}
// Gist via GitHub Pages
.gist .markdown-body {
padding: 15px !important;
}

16
_sass/_layout.scss Normal file
View File

@ -0,0 +1,16 @@
// Layout
//
// Styles for managing the structural hierarchy of the site.
.container {
max-width: 50rem;
padding-left: var(--spacer-2);
padding-right: var(--spacer-2);
margin-left: auto;
margin-right: auto;
}
footer {
margin-top: var(--spacer-3);
margin-bottom: var(--spacer-3);
}

23
_sass/_masthead.scss Normal file
View File

@ -0,0 +1,23 @@
// Masthead
//
// Super small header above the content for site name and short description.
.masthead {
padding-top: var(--spacer);
padding-bottom: var(--spacer);
margin-bottom: var(--spacer-3);
}
.masthead-title {
margin-bottom: 0;
a {
color: inherit;
text-decoration: none;
}
small {
font-weight: 400;
opacity: .5;
}
}

12
_sass/_message.scss Normal file
View File

@ -0,0 +1,12 @@
// Messages
//
// Show alert messages to users. You may add it to single elements like a `<p>`,
// or to a parent if there are multiple elements to show.
.message {
padding: var(--spacer);
margin-bottom: var(--spacer);
color: var(--gray-900);
background-color: var(--yellow-100);
border-radius: var(--border-radius);
}

52
_sass/_pagination.scss Normal file
View File

@ -0,0 +1,52 @@
// Pagination
//
// Super lightweight (HTML-wise) blog pagination. `span`s are provided for when
// there are no more previous or next posts to show.
.pagination {
display: flex;
margin: 0 -1.5rem var(--spacer);
color: var(--gray-500);
text-align: center;
}
// Pagination items can be `span`s or `a`s
.pagination-item {
display: block;
padding: var(--spacer);
text-decoration: none;
border: solid var(--border-color);
border-width: 1px 0;
&:first-child {
margin-bottom: -1px;
}
}
// Only provide a hover state for linked pagination items
a.pagination-item:hover {
background-color: var(--border-color);
}
@media (min-width: 30em) {
.pagination {
margin: var(--spacer-3) 0;
}
.pagination-item {
float: left;
width: 50%;
border-width: 1px;
&:first-child {
margin-bottom: 0;
border-top-left-radius: var(--border-radius);
border-bottom-left-radius: var(--border-radius);
}
&:last-child {
margin-left: -1px;
border-top-right-radius: var(--border-radius);
border-bottom-right-radius: var(--border-radius);
}
}
}

67
_sass/_posts.scss Normal file
View File

@ -0,0 +1,67 @@
// Posts and pages
//
// Each post is wrapped in `.post` and is used on default and post layouts. Each
// page is wrapped in `.page` and is only used on the page layout.
.page,
.post {
margin-bottom: 4em;
li + li {
margin-top: .25rem;
}
}
// Blog post or page title
.page-title,
.post-title {
color: var(--heading-color);
}
.page-title,
.post-title {
margin-top: 0;
}
.post-title a {
color: inherit;
text-decoration: none;
&:hover,
&:focus {
text-decoration: underline;
}
}
// Meta data line below post title
.post-date {
display: block;
margin-top: -.5rem;
margin-bottom: var(--spacer);
color: var(--gray-600);
}
// Related posts
.related {
padding-top: var(--spacer-2);
padding-bottom: var(--spacer-2);
margin-bottom: var(--spacer-2);
border-top: 1px solid var(--border-color);
border-bottom: 1px solid var(--border-color);
}
.related-posts {
padding-left: 0;
list-style: none;
h3 {
margin-top: 0;
}
a {
text-decoration: none;
small {
color: var(--gray-600);
}
}
}

65
_sass/_syntax.scss Normal file
View File

@ -0,0 +1,65 @@
.highlight .hll { background-color: #ffc; }
.highlight .c { color: #999; } /* Comment */
.highlight .err { color: #a00; background-color: #faa } /* Error */
.highlight .k { color: #069; } /* Keyword */
.highlight .o { color: #555 } /* Operator */
.highlight .cm { color: #09f; font-style: italic } /* Comment.Multiline */
.highlight .cp { color: #099 } /* Comment.Preproc */
.highlight .c1 { color: #999; } /* Comment.Single */
.highlight .cs { color: #999; } /* Comment.Special */
.highlight .gd { background-color: #fcc; border: 1px solid #c00 } /* Generic.Deleted */
.highlight .ge { font-style: italic } /* Generic.Emph */
.highlight .gr { color: #f00 } /* Generic.Error */
.highlight .gh { color: #030; } /* Generic.Heading */
.highlight .gi { background-color: #cfc; border: 1px solid #0c0 } /* Generic.Inserted */
.highlight .go { color: #aaa } /* Generic.Output */
.highlight .gp { color: #009; } /* Generic.Prompt */
.highlight .gs { } /* Generic.Strong */
.highlight .gu { color: #030; } /* Generic.Subheading */
.highlight .gt { color: #9c6 } /* Generic.Traceback */
.highlight .kc { color: #069; } /* Keyword.Constant */
.highlight .kd { color: #069; } /* Keyword.Declaration */
.highlight .kn { color: #069; } /* Keyword.Namespace */
.highlight .kp { color: #069 } /* Keyword.Pseudo */
.highlight .kr { color: #069; } /* Keyword.Reserved */
.highlight .kt { color: #078; } /* Keyword.Type */
.highlight .m { color: #f60 } /* Literal.Number */
.highlight .s { color: #d44950 } /* Literal.String */
.highlight .na { color: #4f9fcf } /* Name.Attribute */
.highlight .nb { color: #366 } /* Name.Builtin */
.highlight .nc { color: #0a8; } /* Name.Class */
.highlight .no { color: #360 } /* Name.Constant */
.highlight .nd { color: #99f } /* Name.Decorator */
.highlight .ni { color: #999; } /* Name.Entity */
.highlight .ne { color: #c00; } /* Name.Exception */
.highlight .nf { color: #c0f } /* Name.Function */
.highlight .nl { color: #99f } /* Name.Label */
.highlight .nn { color: #0cf; } /* Name.Namespace */
.highlight .nt { color: #2f6f9f; } /* Name.Tag */
.highlight .nv { color: #033 } /* Name.Variable */
.highlight .ow { color: #000; } /* Operator.Word */
.highlight .w { color: #bbb } /* Text.Whitespace */
.highlight .mf { color: #f60 } /* Literal.Number.Float */
.highlight .mh { color: #f60 } /* Literal.Number.Hex */
.highlight .mi { color: #f60 } /* Literal.Number.Integer */
.highlight .mo { color: #f60 } /* Literal.Number.Oct */
.highlight .sb { color: #c30 } /* Literal.String.Backtick */
.highlight .sc { color: #c30 } /* Literal.String.Char */
.highlight .sd { color: #c30; font-style: italic } /* Literal.String.Doc */
.highlight .s2 { color: #c30 } /* Literal.String.Double */
.highlight .se { color: #c30; } /* Literal.String.Escape */
.highlight .sh { color: #c30 } /* Literal.String.Heredoc */
.highlight .si { color: #a00 } /* Literal.String.Interpol */
.highlight .sx { color: #c30 } /* Literal.String.Other */
.highlight .sr { color: #3aa } /* Literal.String.Regex */
.highlight .s1 { color: #c30 } /* Literal.String.Single */
.highlight .ss { color: #fc3 } /* Literal.String.Symbol */
.highlight .bp { color: #366 } /* Name.Builtin.Pseudo */
.highlight .vc { color: #033 } /* Name.Variable.Class */
.highlight .vg { color: #033 } /* Name.Variable.Global */
.highlight .vi { color: #033 } /* Name.Variable.Instance */
.highlight .il { color: #f60 } /* Literal.Number.Integer.Long */
.css .o,
.css .o + .nt,
.css .nt + .nt { color: #999; }

16
_sass/_toc.scss Normal file
View File

@ -0,0 +1,16 @@
// Table of contents
//
// Frames the `#markdown-toc` element with top and bottom rules and labels it
// with a generated "Contents" heading.
// NOTE(review): presumably the list kramdown emits for `{:toc}`; confirm.
#markdown-toc {
  margin-bottom: var(--spacer-2);
  padding: var(--spacer-2) var(--spacer-3);

  // Order matters: the `border` shorthand resets the width, and the following
  // `border-width` then restricts the rule to the top and bottom edges.
  border: solid var(--border-color);
  border-width: 1px 0;

  &::before {
    content: "Contents";
    display: block;
    // Cancels the container's horizontal padding so the label starts at the
    // container's left edge rather than in line with the list items.
    margin-left: calc(var(--spacer-3) * -1);
    font-weight: 500;
    font-size: 85%;
  }
}

117
_sass/_type.scss Normal file
View File

@ -0,0 +1,117 @@
// Typography
//
// Headings, body text, lists, and other misc typographic elements.
// Defaults shared by every heading level. The per-level rules that follow
// must stay below this one, since they override its bottom margin.
h1,
h2,
h3,
h4,
h5,
h6 {
  color: var(--heading-color);
  font-weight: 600;
  line-height: 1.25;
  margin-bottom: 0.5rem;
}

// Page-level title: largest size, larger gap underneath.
h1 {
  margin-bottom: 1rem;
  font-size: 2rem;
}

// Section heading: the most space above of any heading level.
h2 {
  font-size: 1.5rem;
  margin-top: 2rem;
  margin-bottom: 1rem;
}

h3 {
  font-size: 1.25rem;
  margin-top: 1.5rem;
}

// The three lowest levels share body-sized text.
h4,
h5,
h6 {
  font-size: 1rem;
  margin-top: 1rem;
}
// Paragraphs and lists have no top margin and a 1rem bottom margin, so all
// vertical spacing between them comes from the bottom.
p {
  margin-bottom: 1rem;
  margin-top: 0;
}

ul,
ol,
dl {
  margin-bottom: 1rem;
  margin-top: 0;
}

// Definition lists: bold term, small gap after each description.
dt {
  font-weight: bold;
}

dd {
  margin-bottom: 0.5rem;
}

// Horizontal rule drawn as a single top border.
hr {
  position: relative;
  margin: var(--spacer-2) 0;
  // Clear all borders first, then add only the top one. Keep this order.
  border: 0;
  border-top: 1px solid var(--border-color);
}
// Abbreviations: small, bold, uppercase text in a muted gray.
abbr {
  color: var(--gray-600);
  font-size: 85%;
  font-weight: bold;
  text-transform: uppercase;

  // Only abbreviations that carry a `title` get the help cursor and the
  // dotted underline.
  &[title] {
    border-bottom: 1px dotted var(--border-color);
    cursor: help;
  }
}
// Block quotes: muted text with a solid rule down the left side.
blockquote {
  margin: 0.8rem 0;
  padding: 0.5rem 1rem;
  color: var(--gray-500);
  border-left: 0.25rem solid var(--border-color);

  // Drop the trailing paragraph margin so the quote's bottom padding is even.
  p:last-child {
    margin-bottom: 0;
  }

  // From 30em up, override the horizontal padding set above.
  @media (min-width: 30em) {
    padding-left: 1.25rem;
    padding-right: 5rem;
  }
}

// Remove the margin from figures.
figure {
  margin: 0;
}
// Markdown footnotes
//
// See the example content post for an example.

// Matches links by `href` prefix:
//   `#fn:`    - the footnote number within body text
//   `#fnref:` - the "back to text" link inside a footnote
a[href^="#fn:"],
a[href^="#fnref:"] {
  display: inline-block;
  font-weight: bold;
  margin-left: 0.1rem;
}

// The list of footnotes itself: smaller text, set apart from what precedes it.
.footnotes {
  font-size: 85%;
  margin-top: 2rem;
}
// Custom type
//
// Add `.lead` to a paragraph for larger, lighter introductory text.
.lead {
  font-weight: 300;
  font-size: 1.25rem;
}

66
_sass/_variables.scss Normal file
View File

@ -0,0 +1,66 @@
// Design tokens exposed as CSS custom properties on `:root`.
:root {
  // Gray scale, lightest (000) to darkest (900).
  --gray-000: #f8f9fa;
  --gray-100: #f1f3f5;
  --gray-200: #e9ecef;
  --gray-300: #dee2e6;
  --gray-400: #ced4da;
  --gray-500: #adb5bd;
  --gray-600: #868e96;
  --gray-700: #495057;
  --gray-800: #343a40;
  --gray-900: #212529;

  // Named colors.
  --red: #fa5252;
  --pink: #e64980;
  --grape: #be4bdb;
  --purple: #7950f2;
  --indigo: #4c6ef5;
  --blue: #228be6;
  --cyan: #15aabf;
  --teal: #12b886;
  --green: #40c057;
  --yellow: #fab005;
  --orange: #fd7e14;

  // Extra tints; the blue ones are referenced by the disabled dark scheme below.
  --blue-300: #74c0fc;
  --blue-400: #4dabf7;
  --yellow-100: #fff3bf;

  // Body text.
  --body-font: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji";
  --body-font-size: 18px;
  --body-line-height: 1.5;
  --body-color: var(--gray-700);
  --body-bg: #fff;

  // Links, headings, and borders.
  --link-color: var(--blue);
  --link-hover-color: #1c7ed6;
  --heading-color: var(--gray-900);
  --border-color: var(--gray-300);
  --border-radius: .25rem;

  // Code.
  --code-font: SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
  --code-color: var(--grape);
  --code-bg: var(--gray-000);

  // Spacing scale: `--spacer-2` is 1.5x and `--spacer-3` is 3x the base spacer.
  --spacer: 1rem;
  --spacer-2: calc(var(--spacer) * 1.5);
  --spacer-3: calc(var(--spacer) * 3);
}

// Dark color scheme overrides. Currently commented out, so the values above
// apply regardless of the user's `prefers-color-scheme` setting.
// @media (prefers-color-scheme: dark) {
// :root {
// --body-color: var(--gray-300);
// --body-bg: var(--gray-800);
// --heading-color: #fff;
// --link-color: var(--blue-300);
// --link-hover-color: var(--blue-400);
// --border-color: rgba(255,255,255,.15);
// --code-bg: var(--gray-900);
// }
// }

29
about.md Normal file
View File

@ -0,0 +1,29 @@
---
layout: page
title: About
---
<p class="message">
Hey there! This page is included as an example. Feel free to customize it for your own use upon downloading. Carry on!
</p>

In the novel, *The Strange Case of Dr. Jekyll and Mr. Hyde*, Mr. Poole is Dr. Jekyll's virtuous and loyal butler. Similarly, Poole is an upstanding and effective butler who helps you build Jekyll themes. It's made by [@mdo](https://twitter.com/mdo).

There are currently two themes built on Poole:

- [Hyde](https://hyde.getpoole.com)
- [Lanyon](https://lanyon.getpoole.com)

Learn more and contribute on [GitHub](https://github.com/poole).


## Setup

Some fun facts about the setup of this project include:

- Built for [Jekyll](https://jekyllrb.com)
- Developed on GitHub and hosted for free on [GitHub Pages](https://pages.github.com)
- Coded with [Atom](https://atom.io), an amazing open source code editor

Have questions or suggestions? Feel free to [open an issue on GitHub](https://github.com/poole/poole/issues/new) or [ask me on Twitter](https://twitter.com/mdo).

Thanks for reading!

18
archive.md Normal file
View File

@ -0,0 +1,18 @@
---
# Archive page: lists every post, grouped under "Month Year" headings.
layout: default
title: Archive
---

# Archive

Browse all posts by month and year.

{% comment %}
  Group posts by their date formatted as "%B %Y" (e.g. "June 2023"). Each group
  exposes `name` (the formatted date) and `items` (the posts in that group).
{% endcomment %}
{% assign postsByYearMonth = site.posts | group_by_exp: "post", "post.date | date: '%B %Y'" %}
{% for yearMonth in postsByYearMonth %}
<h2>{{ yearMonth.name }}</h2>
<ul>
{% for post in yearMonth.items %}
  {% comment %} `relative_url` prefixes `site.baseurl`, so links also work when the site is served from a sub-path. {% endcomment %}
  <li><a href="{{ post.url | relative_url }}">{{ post.title }}</a></li>
{% endfor %}
</ul>
{% endfor %}

Binary file not shown.

After

Width:  |  Height:  |  Size: 115 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 342 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 183 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 449 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 196 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 267 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 285 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 259 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 276 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 244 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 260 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 255 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 272 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 53 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 86 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 88 KiB

28
atom.xml Normal file
View File

@ -0,0 +1,28 @@
---
# Atom feed of all posts. No layout is applied, so the XML declaration below
# is the first thing in the generated file.
layout: null
---
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  {% comment %} Site-level text is XML-escaped like the per-post fields, so "&" or "<" in the config cannot break the feed. {% endcomment %}
  <title>{{ site.title | xml_escape }}</title>
  <link href="{{ '/atom.xml' | relative_url }}" rel="self"/>
  <link href="{{ site.url }}{{ site.baseurl }}/"/>
  <updated>{{ site.time | date_to_xmlschema }}</updated>
  <id>{{ site.url }}</id>
  <author>
    <name>{{ site.author.name | xml_escape }}</name>
    <email>{{ site.author.email | xml_escape }}</email>
  </author>
  {% for post in site.posts %}
  <entry>
    <title>{{ post.title | xml_escape }}</title>
    <link href="{{ site.url }}{{ site.baseurl }}{{ post.url }}"/>
    <updated>{{ post.date | date_to_xmlschema }}</updated>
    {% comment %} Entry ids are left unchanged so feed readers do not treat existing posts as new. {% endcomment %}
    <id>{{ site.url }}{{ post.id }}</id>
    <content type="html">{{ post.content | xml_escape }}</content>
  </entry>
  {% endfor %}
</feed>

23
index.html Normal file
View File

@ -0,0 +1,23 @@
---
# Home page: renders the full content of each post on the current paginator page.
layout: default
title: "vLLM: Easy, Fast, and Cheap LLM Serving with PagedAttention"
---
{% for post in paginator.posts %}
<article class="post">
  {{ post.content }}
</article>
{% endfor %}

{% comment %}
  The pagination controls below are disabled by the surrounding HTML comment.
  Liquid still evaluates tags inside an HTML comment, so the markup must stay
  valid. Both links go through the `relative_url` filter so they respect
  `site.baseurl`.
{% endcomment %}
<!-- <div class="pagination">
  {% if paginator.next_page %}
    <a class="pagination-item older" href="{{ paginator.next_page_path | relative_url }}">Older</a>
  {% else %}
    <span class="pagination-item older">Older</span>
  {% endif %}
  {% if paginator.previous_page %}
    <a class="pagination-item newer" href="{{ paginator.previous_page_path | relative_url }}">Newer</a>
  {% else %}
    <span class="pagination-item newer">Newer</span>
  {% endif %}
</div> -->

19
poole-for-jekyll.gemspec Normal file
View File

@ -0,0 +1,19 @@
# frozen_string_literal: true

# Gem specification for the "poole-for-jekyll" theme gem.
#
# Evaluating this file shells out to `git ls-files`, so the packaged file list
# contains only git-tracked paths.
Gem::Specification.new do |spec|
  spec.name     = "poole-for-jekyll"
  spec.version  = "3.0.0"
  spec.authors  = ["Mark Otto"]
  spec.email    = ["markdotto@gmail.com"]

  spec.summary  = "The Jekyll Butler. A no frills responsive Jekyll blog theme."
  spec.homepage = "https://getpoole.com"
  spec.license  = "MIT"

  # Ship only the theme itself: tracked paths that start (case-insensitively)
  # with one of the theme directories or with LICENSE/README.
  theme_paths = %r!^(assets|_layouts|_includes|_sass|LICENSE|README)!i
  spec.files  = `git ls-files -z`.split("\x0").select { |path| path.match?(theme_paths) }

  spec.add_runtime_dependency "jekyll", "~> 4.0"

  # ">= 1.16" instead of "~> 1.16": the pessimistic constraint excludes
  # Bundler 2.x, so dependency resolution failed on current toolchains.
  spec.add_development_dependency "bundler", ">= 1.16"
  spec.add_development_dependency "rake", "~> 12.0"
end

47
styles.scss Normal file
View File

@ -0,0 +1,47 @@
---
# Use a comment to ensure Jekyll reads the file to be transformed into CSS later
# only main files contain this front matter, not partials.
---
//
// ___
// /\_ \
// _____ ___ ___\//\ \ __
// /\ '__`\ / __`\ / __`\\ \ \ /'__`\
// \ \ \_\ \/\ \_\ \/\ \_\ \\_\ \_/\ __/
// \ \ ,__/\ \____/\ \____//\____\ \____\
// \ \ \/ \/___/ \/___/ \/____/\/____/
// \ \_\
// \/_/
//
// Designed, built, and released under MIT license by @mdo. Learn more at
// https://github.com/poole/poole.
@import "variables";
@import "base";
@import "type";
@import "syntax";
@import "code";
@import "layout";
@import "masthead";
@import "posts";
@import "pagination";
@import "message";
@import "toc";
// Color swatches
//
// Two-column grid: the first column is sized to the swatch (`dt`), and the
// description (`dd`) takes the remaining width.
.colors {
  display: grid;
  grid-template-columns: max-content 1fr;

  // The swatch square. Nothing here sets its fill color.
  dt {
    height: 3rem;
    width: 3rem;
    border-radius: var(--border-radius);
    // Faint inner outline so the swatch edge stays visible.
    box-shadow: inset 0 0 0 1px rgba(255,255,255,.15);
  }

  dd {
    margin-left: var(--spacer);
  }
}