// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package tokenizer converts a text into a stream of tokens.
package tokenizer

import (
	"bytes"
	"fmt"
	"hash/crc32"
	"sort"
	"unicode"
	"unicode/utf8"
)

// token is a non-whitespace sequence (i.e., a word or punctuation mark) in
// the original string. It is not meant for use outside of this package.
type token struct {
	Text   string
	Offset int
}

// Tokens is a list of token objects.
type Tokens []*token

// newToken creates a new token with an invalid (negative) offset, which will
// be set before the token is used.
func newToken() *token {
	return &token{Offset: -1}
}

// Tokenize converts a string into a stream of tokens.
func Tokenize(s string) (toks Tokens) {
	tok := newToken()
	for i := 0; i < len(s); {
		r, size := utf8.DecodeRuneInString(s[i:])
		switch {
		case unicode.IsSpace(r):
			if tok.Offset >= 0 {
				toks = append(toks, tok)
				tok = newToken()
			}
		case unicode.IsPunct(r):
			if tok.Offset >= 0 {
				toks = append(toks, tok)
				tok = newToken()
			}
			toks = append(toks, &token{
				Text:   string(r),
				Offset: i,
			})
		default:
			if tok.Offset == -1 {
				tok.Offset = i
			}
			tok.Text += string(r)
		}
		i += size
	}
	if tok.Offset != -1 {
		// Add any remaining token that wasn't yet included in the list.
		toks = append(toks, tok)
	}
	return toks
}
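
// exampleTokenize is an illustrative sketch, not part of the original
// package: it shows what Tokenize produces for a small input. Words carry
// the byte offset where they start, and each punctuation rune becomes its
// own token, so for "Hello, world" this prints "Hello"@0 ","@5 "world"@7.
func exampleTokenize() {
	for _, tok := range Tokenize("Hello, world") {
		fmt.Printf("%q@%d ", tok.Text, tok.Offset)
	}
}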

// GenerateHashes generates checksums for token windows of length "size",
// stepping the window forward by size/2 so that consecutive windows overlap
// by half. The stringifyTokens call takes a long time to run, so not every
// substring is hashed; the smaller substrings between window boundaries are
// skipped.
func (t Tokens) GenerateHashes(h Hash, size int) ([]uint32, TokenRanges) {
	if size == 0 {
		return nil, nil
	}

	var css []uint32
	var tr TokenRanges
	for offset := 0; offset+size <= len(t); offset += size / 2 {
		var b bytes.Buffer
		t.stringifyTokens(&b, offset, size)
		cs := crc32.ChecksumIEEE(b.Bytes())
		css = append(css, cs)
		tr = append(tr, &TokenRange{offset, offset + size})
		h.add(cs, offset, offset+size)
		if size <= 1 {
			break
		}
	}

	return css, tr
}
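
// exampleGenerateHashes is an illustrative sketch, not part of the original
// package: with six tokens and size 4, windows start at offsets 0 and 2,
// so this prints "2 [[0, 4) [2, 6)]".
func exampleGenerateHashes() {
	toks := Tokenize("a b c d e f")
	h := make(Hash)
	css, ranges := toks.GenerateHashes(h, 4)
	fmt.Println(len(css), ranges)
}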

// stringifyTokens serializes a sublist of tokens into a bytes buffer.
func (t Tokens) stringifyTokens(b *bytes.Buffer, offset, size int) {
	for j := offset; j < offset+size; j++ {
		if j != offset {
			b.WriteRune(' ')
		}
		b.WriteString(t[j].Text)
	}
}

// TokenRange indicates the range of tokens that map to a particular checksum.
type TokenRange struct {
	Start int
	End   int
}

func (t *TokenRange) String() string {
	return fmt.Sprintf("[%v, %v)", t.Start, t.End)
}

// TokenRanges is a list of TokenRange objects. The chance that two different
// strings map to the same checksum is very small, but unfortunately isn't
// zero, so we use this instead of assuming that they will all be unique.
type TokenRanges []*TokenRange

func (t TokenRanges) Len() int      { return len(t) }
func (t TokenRanges) Swap(i, j int) { t[i], t[j] = t[j], t[i] }

// Less orders ranges by Start and breaks ties on End, so identical ranges
// always sort adjacently and CombineUnique can de-duplicate them reliably.
func (t TokenRanges) Less(i, j int) bool {
	if t[i].Start == t[j].Start {
		return t[i].End < t[j].End
	}
	return t[i].Start < t[j].Start
}

// CombineUnique returns the combination of both token range lists with no
// duplicates.
func (t TokenRanges) CombineUnique(other TokenRanges) TokenRanges {
	if len(other) == 0 {
		return t
	}
	if len(t) == 0 {
		return other
	}

	// Copy into a fresh slice so that sorting and appending don't mutate
	// the callers' backing arrays.
	cu := make(TokenRanges, 0, len(t)+len(other))
	cu = append(cu, t...)
	cu = append(cu, other...)
	sort.Sort(cu)

	res := TokenRanges{cu[0]}
	for prev, i := cu[0], 1; i < len(cu); i++ {
		if prev.Start != cu[i].Start || prev.End != cu[i].End {
			res = append(res, cu[i])
			prev = cu[i]
		}
	}
	return res
}
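
// exampleCombineUnique is an illustrative sketch, not part of the original
// package: combining [0, 4), [2, 6) with [2, 6), [4, 8) prints the sorted,
// de-duplicated "[[0, 4) [2, 6) [4, 8)]".
func exampleCombineUnique() {
	a := TokenRanges{{Start: 0, End: 4}, {Start: 2, End: 6}}
	b := TokenRanges{{Start: 2, End: 6}, {Start: 4, End: 8}}
	fmt.Println(a.CombineUnique(b))
}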

// Hash is a map from the checksum of a section of text to the token ranges
// covering that text.
type Hash map[uint32]TokenRanges

// add associates a token range, [start, end), with a checksum.
func (h Hash) add(checksum uint32, start, end int) {
	ntr := &TokenRange{Start: start, End: end}
	if r, ok := h[checksum]; ok {
		for _, tr := range r {
			if tr.Start == ntr.Start && tr.End == ntr.End {
				// The token range already exists at this
				// checksum. No need to re-add it.
				return
			}
		}
	}
	h[checksum] = append(h[checksum], ntr)
}
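
// exampleHashAdd is an illustrative sketch, not part of the original package:
// add ignores an exact duplicate range but keeps a distinct range that
// collides on the same checksum, so this prints "[[0, 4) [4, 8)]".
func exampleHashAdd() {
	h := make(Hash)
	h.add(0xdeadbeef, 0, 4)
	h.add(0xdeadbeef, 0, 4) // exact duplicate: ignored
	h.add(0xdeadbeef, 4, 8) // same checksum, different range: kept
	fmt.Println(h[0xdeadbeef])
}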