mirror of https://github.com/knative/caching.git
492 lines
16 KiB
Go
492 lines
16 KiB
Go
// Copyright 2017 Google Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
// Package searchset generates hashes for all substrings of a text. Potential
|
|
// matches between two SearchSet objects can then be determined quickly.
|
|
// Generating the hashes can be expensive, so it's best to perform it once. If
|
|
// the text is part of a known corpus, then the SearchSet can be serialized and
|
|
// kept in an archive.
|
|
//
|
|
// Matching occurs by "mapping" ranges from the source text into the target
|
|
// text but still retaining the source order:
|
|
//
|
|
// SOURCE: |-----------------------------|
|
|
//
|
|
// TARGET: |*****************************************|
|
|
//
|
|
// MAP SOURCE SECTIONS ONTO TARGET IN SOURCE ORDER:
|
|
//
|
|
// S: |-[--]-----[---]------[----]------|
|
|
// / | \
|
|
// |---| |---------| |-------------|
|
|
// T: |*****************************************|
|
|
//
|
|
// Note that a single source range may match many different ranges in the
|
|
// target. The matching algorithm untangles these so that all matched ranges
|
|
// are in order with respect to the source ranges. This is especially important
|
|
// since the source text may occur more than once in the target text. The
|
|
// algorithm finds each potential occurrence of S in T and returns all as
|
|
// potential matched ranges.
|
|
package searchset
|
|
|
|
import (
|
|
"encoding/gob"
|
|
"fmt"
|
|
"io"
|
|
"sort"
|
|
|
|
"github.com/google/licenseclassifier/stringclassifier/searchset/tokenizer"
|
|
)
|
|
|
|
// DefaultGranularity is the minimum size (in words) of the hash chunks.
|
|
const DefaultGranularity = 3
|
|
|
|
// SearchSet is a set of substrings that have hashes associated with them,
|
|
// making it fast to search for potential matches.
|
|
type SearchSet struct {
|
|
// Tokens is a tokenized list of the original input string.
|
|
Tokens tokenizer.Tokens
|
|
// Hashes is a map of checksums to a range of tokens.
|
|
Hashes tokenizer.Hash
|
|
// Checksums is a list of checksums ordered from longest range to
|
|
// shortest.
|
|
Checksums []uint32
|
|
// ChecksumRanges are the token ranges for the above checksums.
|
|
ChecksumRanges tokenizer.TokenRanges
|
|
|
|
nodes []*node
|
|
}
|
|
|
|
// node consists of a range of tokens along with the checksum for those tokens.
|
|
type node struct {
|
|
checksum uint32
|
|
tokens *tokenizer.TokenRange
|
|
}
|
|
|
|
func (n *node) String() string {
|
|
return fmt.Sprintf("[%d:%d]", n.tokens.Start, n.tokens.End)
|
|
}
|
|
|
|
// New creates a new SearchSet object. It generates a hash for each substring of "s".
|
|
func New(s string, granularity int) *SearchSet {
|
|
toks := tokenizer.Tokenize(s)
|
|
|
|
// Start generating hash values for all substrings within the text.
|
|
h := make(tokenizer.Hash)
|
|
checksums, tokenRanges := toks.GenerateHashes(h, func(a, b int) int {
|
|
if a < b {
|
|
return a
|
|
}
|
|
return b
|
|
}(len(toks), granularity))
|
|
sset := &SearchSet{
|
|
Tokens: toks,
|
|
Hashes: h,
|
|
Checksums: checksums,
|
|
ChecksumRanges: tokenRanges,
|
|
}
|
|
sset.GenerateNodeList()
|
|
return sset
|
|
}
|
|
|
|
// GenerateNodeList creates a node list out of the search set.
|
|
func (s *SearchSet) GenerateNodeList() {
|
|
if len(s.Tokens) == 0 {
|
|
return
|
|
}
|
|
|
|
for i := 0; i < len(s.Checksums); i++ {
|
|
s.nodes = append(s.nodes, &node{
|
|
checksum: s.Checksums[i],
|
|
tokens: s.ChecksumRanges[i],
|
|
})
|
|
}
|
|
}
|
|
|
|
// Serialize emits the SearchSet out so that it can be recreated at a later
|
|
// time.
|
|
func (s *SearchSet) Serialize(w io.Writer) error {
|
|
return gob.NewEncoder(w).Encode(s)
|
|
}
|
|
|
|
// Deserialize reads a file with a serialized SearchSet in it and reconstructs it.
|
|
func Deserialize(r io.Reader, s *SearchSet) error {
|
|
if err := gob.NewDecoder(r).Decode(&s); err != nil {
|
|
return err
|
|
}
|
|
s.GenerateNodeList()
|
|
return nil
|
|
}
|
|
|
|
// MatchRange is the range within the source text that is a match to the range
|
|
// in the target text.
|
|
type MatchRange struct {
|
|
// Offsets into the source tokens.
|
|
SrcStart, SrcEnd int
|
|
// Offsets into the target tokens.
|
|
TargetStart, TargetEnd int
|
|
}
|
|
|
|
// in returns true if the start and end are enclosed in the match range.
|
|
func (m *MatchRange) in(start, end int) bool {
|
|
return start >= m.TargetStart && end <= m.TargetEnd
|
|
}
|
|
|
|
func (m *MatchRange) String() string {
|
|
return fmt.Sprintf("[%v, %v)->[%v, %v)", m.SrcStart, m.SrcEnd, m.TargetStart, m.TargetEnd)
|
|
}
|
|
|
|
// MatchRanges is a list of "MatchRange"s. The ranges are monotonically
|
|
// increasing in value and indicate a single potential occurrence of the source
|
|
// text in the target text.
|
|
type MatchRanges []*MatchRange
|
|
|
|
func (m MatchRanges) Len() int { return len(m) }
|
|
func (m MatchRanges) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
|
|
func (m MatchRanges) Less(i, j int) bool {
|
|
if m[i].TargetStart < m[j].TargetStart {
|
|
return true
|
|
}
|
|
return m[i].TargetStart == m[j].TargetStart && m[i].SrcStart < m[j].SrcStart
|
|
}
|
|
|
|
// TargetRange is the start and stop token offsets into the target text.
|
|
func (m MatchRanges) TargetRange(target *SearchSet) (start, end int) {
|
|
start = target.Tokens[m[0].TargetStart].Offset
|
|
end = target.Tokens[m[len(m)-1].TargetEnd-1].Offset + len(target.Tokens[m[len(m)-1].TargetEnd-1].Text)
|
|
return start, end
|
|
}
|
|
|
|
// Size is the number of source tokens that were matched.
|
|
func (m MatchRanges) Size() int {
|
|
sum := 0
|
|
for _, mr := range m {
|
|
sum += mr.SrcEnd - mr.SrcStart
|
|
}
|
|
return sum
|
|
}
|
|
|
|
// FindPotentialMatches returns the ranges in the target (unknown) text that
|
|
// are best potential matches to the source (known) text.
|
|
func FindPotentialMatches(src, target *SearchSet) []MatchRanges {
|
|
matchedRanges := getMatchedRanges(src, target)
|
|
if len(matchedRanges) == 0 {
|
|
return nil
|
|
}
|
|
|
|
// Cleanup the matching ranges so that we get the longest contiguous ranges.
|
|
for i := 0; i < len(matchedRanges); i++ {
|
|
matchedRanges[i] = coalesceMatchRanges(matchedRanges[i])
|
|
}
|
|
return matchedRanges
|
|
}
|
|
|
|
// getMatchedRanges finds the ranges in the target text that match the source
|
|
// text. There can be multiple occurrences of the source text within the target
|
|
// text. Each separate occurrence is an entry in the returned slice.
|
|
func getMatchedRanges(src, target *SearchSet) []MatchRanges {
|
|
matched := targetMatchedRanges(src, target)
|
|
if len(matched) == 0 {
|
|
return nil
|
|
}
|
|
sort.Sort(matched)
|
|
matched = untangleSourceRanges(matched)
|
|
matchedRanges := splitRanges(matched)
|
|
return mergeConsecutiveRanges(matchedRanges)
|
|
}
|
|
|
|
func extendsAny(tr tokenizer.TokenRanges, mr []MatchRanges) bool {
|
|
if len(mr) == 0 {
|
|
return false
|
|
}
|
|
for _, tv := range tr {
|
|
for _, mv := range mr {
|
|
if tv.Start >= mv[0].TargetStart && tv.Start <= mv[len(mv)-1].TargetEnd {
|
|
return true
|
|
}
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// targetMatchedRanges finds matching sequences in target and src ordered by target position
|
|
func targetMatchedRanges(src, target *SearchSet) MatchRanges {
|
|
if src.nodes == nil {
|
|
return nil
|
|
}
|
|
|
|
var matched MatchRanges
|
|
var previous *node
|
|
var possible []MatchRanges
|
|
for _, tgtNode := range target.nodes {
|
|
sr, ok := src.Hashes[tgtNode.checksum]
|
|
if !ok || (previous != nil && tgtNode.tokens.Start > previous.tokens.End) || !extendsAny(sr, possible) {
|
|
for _, r := range possible {
|
|
matched = append(matched, r...)
|
|
}
|
|
possible = possible[:0]
|
|
previous = nil
|
|
}
|
|
if !ok {
|
|
// There isn't a match in the source.
|
|
continue
|
|
}
|
|
|
|
// Maps index within `possible` to the slice of ranges extended by a new range
|
|
extended := make(map[int]*MatchRanges)
|
|
// Go over the set of source ranges growing lists of `possible` match ranges.
|
|
tv := tgtNode.tokens
|
|
for _, sv := range sr {
|
|
r := &MatchRange{
|
|
SrcStart: sv.Start,
|
|
SrcEnd: sv.End,
|
|
TargetStart: tv.Start,
|
|
TargetEnd: tv.End,
|
|
}
|
|
found := false
|
|
// Grow or extend each abutting `possible` match range.
|
|
for i, p := range possible {
|
|
last := p[len(p)-1]
|
|
if sv.Start >= last.SrcStart && sv.Start <= last.SrcEnd && tv.Start >= last.TargetStart && tv.Start <= last.TargetEnd {
|
|
found = true
|
|
possible[i] = append(possible[i], r)
|
|
extended[i] = &possible[i]
|
|
}
|
|
}
|
|
if !found {
|
|
// Did not abut any existing ranges, start a new `possible` match range.
|
|
mrs := make(MatchRanges, 0, 2)
|
|
mrs = append(mrs, r)
|
|
possible = append(possible, mrs)
|
|
extended[len(possible)-1] = &possible[len(possible)-1]
|
|
}
|
|
}
|
|
if len(extended) < len(possible) {
|
|
// Ranges not extended--add to `matched` if not included in other range.
|
|
for i := 0; i < len(possible); {
|
|
_, updated := extended[i]
|
|
if updated {
|
|
i++ // Keep in `possible` and advance to next index.
|
|
continue
|
|
}
|
|
p1 := possible[i]
|
|
found := false // whether found as subrange of another `possible` match.
|
|
for _, p2 := range extended {
|
|
if p1[0].SrcStart >= (*p2)[0].SrcStart && p1[0].TargetStart >= (*p2)[0].TargetStart {
|
|
found = true
|
|
break
|
|
}
|
|
}
|
|
if !found {
|
|
matched = append(matched, p1...)
|
|
} // else included in other match.
|
|
// Finished -- delete from `possible` and continue from same index.
|
|
possible = append(possible[:i], possible[i+1:]...)
|
|
}
|
|
}
|
|
previous = tgtNode
|
|
}
|
|
// At end of file, terminate all `possible` match ranges.
|
|
for i := 0; i < len(possible); i++ {
|
|
p1 := possible[i]
|
|
found := false // whether found as subrange of another `possible` match.
|
|
for j := i + 1; j < len(possible); {
|
|
p2 := possible[j]
|
|
if p1[0].SrcStart <= p2[0].SrcStart && p1[0].TargetStart <= p2[0].TargetStart {
|
|
// Delete later sub-ranges included in this range.
|
|
possible = append(possible[:j], possible[j+1:]...)
|
|
continue
|
|
}
|
|
// Skip if subrange of a later range
|
|
if p1[0].SrcStart >= p2[0].SrcStart && p1[0].TargetStart >= p2[0].TargetStart {
|
|
found = true
|
|
}
|
|
j++
|
|
}
|
|
if !found {
|
|
matched = append(matched, p1...)
|
|
}
|
|
}
|
|
return matched
|
|
}
|
|
|
|
// untangleSourceRanges goes through the ranges and removes any whose source
|
|
// ranges are "out of order". A source range is "out of order" if the source
|
|
// range is out of sequence with the source ranges before and after it. This
|
|
// happens when more than one source range maps to the same target range.
|
|
// E.g.:
|
|
//
|
|
// SrcStart: 20, SrcEnd: 30, TargetStart: 127, TargetEnd: 137
|
|
// 1: SrcStart: 12, SrcEnd: 17, TargetStart: 138, TargetEnd: 143
|
|
// 2: SrcStart: 32, SrcEnd: 37, TargetStart: 138, TargetEnd: 143
|
|
// SrcStart: 38, SrcEnd: 40, TargetStart: 144, TargetEnd: 146
|
|
//
|
|
// Here (1) is out of order, because the source range [12, 17) is out of
|
|
// sequence with the surrounding source sequences, but [32, 37) is.
|
|
func untangleSourceRanges(matched MatchRanges) MatchRanges {
|
|
mr := MatchRanges{matched[0]}
|
|
NEXT:
|
|
for i := 1; i < len(matched); i++ {
|
|
if mr[len(mr)-1].TargetStart == matched[i].TargetStart && mr[len(mr)-1].TargetEnd == matched[i].TargetEnd {
|
|
// The matched range has already been added.
|
|
continue
|
|
}
|
|
|
|
if i+1 < len(matched) && equalTargetRange(matched[i], matched[i+1]) {
|
|
// A sequence of ranges match the same target range.
|
|
// Find the first one that has a source range greater
|
|
// than the currently matched range. Omit all others.
|
|
if matched[i].SrcStart > mr[len(mr)-1].SrcStart {
|
|
mr = append(mr, matched[i])
|
|
continue
|
|
}
|
|
|
|
for j := i + 1; j < len(matched) && equalTargetRange(matched[i], matched[j]); j++ {
|
|
// Check subsequent ranges to see if we can
|
|
// find one that matches in the correct order.
|
|
if matched[j].SrcStart > mr[len(mr)-1].SrcStart {
|
|
mr = append(mr, matched[j])
|
|
i = j
|
|
continue NEXT
|
|
}
|
|
}
|
|
}
|
|
|
|
mr = append(mr, matched[i])
|
|
}
|
|
return mr
|
|
}
|
|
|
|
// equalTargetRange returns true if the two MatchRange's cover the same target range.
|
|
func equalTargetRange(this, that *MatchRange) bool {
|
|
return this.TargetStart == that.TargetStart && this.TargetEnd == that.TargetEnd
|
|
}
|
|
|
|
// splitRanges splits the matched ranges so that a single match range has a
|
|
// monotonically increasing source range (indicating a single, potential
|
|
// instance of the source in the target).
|
|
func splitRanges(matched MatchRanges) []MatchRanges {
|
|
var matchedRanges []MatchRanges
|
|
mr := MatchRanges{matched[0]}
|
|
for i := 1; i < len(matched); i++ {
|
|
if mr[len(mr)-1].SrcStart > matched[i].SrcStart {
|
|
matchedRanges = append(matchedRanges, mr)
|
|
mr = MatchRanges{matched[i]}
|
|
} else {
|
|
mr = append(mr, matched[i])
|
|
}
|
|
}
|
|
matchedRanges = append(matchedRanges, mr)
|
|
return matchedRanges
|
|
}
|
|
|
|
// mergeConsecutiveRanges goes through the matched ranges and merges
|
|
// consecutive ranges. Two ranges are consecutive if the end of the previous
|
|
// matched range and beginning of the next matched range overlap. "matched"
|
|
// should have 1 or more MatchRanges, each with one or more MatchRange objects.
|
|
func mergeConsecutiveRanges(matched []MatchRanges) []MatchRanges {
|
|
mr := []MatchRanges{matched[0]}
|
|
|
|
// Convenience functions.
|
|
prevMatchedRange := func() MatchRanges {
|
|
return mr[len(mr)-1]
|
|
}
|
|
prevMatchedRangeLastElem := func() *MatchRange {
|
|
return prevMatchedRange()[len(prevMatchedRange())-1]
|
|
}
|
|
|
|
// This algorithm compares the start of each MatchRanges object to the
|
|
// end of the previous MatchRanges object. If they overlap, then it
|
|
// tries to combine them. Note that a 0 offset into a MatchRanges
|
|
// object (e.g., matched[i][0]) is its first MatchRange, which
|
|
// indicates the start of the whole matched range.
|
|
NEXT:
|
|
for i := 1; i < len(matched); i++ {
|
|
if prevMatchedRangeLastElem().TargetEnd > matched[i][0].TargetStart {
|
|
// Consecutive matched ranges overlap. Merge them.
|
|
if prevMatchedRangeLastElem().TargetStart < matched[i][0].TargetStart {
|
|
// The last element of the previous matched
|
|
// range overlaps with the first element of the
|
|
// current matched range. Concatenate them.
|
|
if prevMatchedRangeLastElem().TargetEnd < matched[i][0].TargetEnd {
|
|
prevMatchedRangeLastElem().SrcEnd += matched[i][0].TargetEnd - prevMatchedRangeLastElem().TargetEnd
|
|
prevMatchedRangeLastElem().TargetEnd = matched[i][0].TargetEnd
|
|
}
|
|
mr[len(mr)-1] = append(prevMatchedRange(), matched[i][1:]...)
|
|
continue
|
|
}
|
|
|
|
for j := 1; j < len(matched[i]); j++ {
|
|
// Find the positions in the ranges where the
|
|
// tail end of the previous matched range
|
|
// overlaps with the start of the next matched
|
|
// range.
|
|
for k := len(prevMatchedRange()) - 1; k > 0; k-- {
|
|
if prevMatchedRange()[k].SrcStart < matched[i][j].SrcStart &&
|
|
prevMatchedRange()[k].TargetStart < matched[i][j].TargetStart {
|
|
// Append the next range to the previous range.
|
|
if prevMatchedRange()[k].TargetEnd < matched[i][j].TargetStart {
|
|
// Coalesce the ranges.
|
|
prevMatchedRange()[k].SrcEnd += matched[i][j-1].TargetEnd - prevMatchedRange()[k].TargetEnd
|
|
prevMatchedRange()[k].TargetEnd = matched[i][j-1].TargetEnd
|
|
}
|
|
mr[len(mr)-1] = append(prevMatchedRange()[:k+1], matched[i][j:]...)
|
|
continue NEXT
|
|
}
|
|
}
|
|
}
|
|
}
|
|
mr = append(mr, matched[i])
|
|
}
|
|
return mr
|
|
}
|
|
|
|
// coalesceMatchRanges coalesces overlapping match ranges into a single
|
|
// contiguous match range.
|
|
func coalesceMatchRanges(matchedRanges MatchRanges) MatchRanges {
|
|
coalesced := MatchRanges{matchedRanges[0]}
|
|
for i := 1; i < len(matchedRanges); i++ {
|
|
c := coalesced[len(coalesced)-1]
|
|
mr := matchedRanges[i]
|
|
|
|
if mr.SrcStart <= c.SrcEnd && mr.SrcStart >= c.SrcStart {
|
|
var se, ts, te int
|
|
if mr.SrcEnd > c.SrcEnd {
|
|
se = mr.SrcEnd
|
|
} else {
|
|
se = c.SrcEnd
|
|
}
|
|
if mr.TargetStart < c.TargetStart {
|
|
ts = mr.TargetStart
|
|
} else {
|
|
ts = c.TargetStart
|
|
}
|
|
if mr.TargetEnd > c.TargetEnd {
|
|
te = mr.TargetEnd
|
|
} else {
|
|
te = c.TargetEnd
|
|
}
|
|
coalesced[len(coalesced)-1] = &MatchRange{
|
|
SrcStart: c.SrcStart,
|
|
SrcEnd: se,
|
|
TargetStart: ts,
|
|
TargetEnd: te,
|
|
}
|
|
} else {
|
|
coalesced = append(coalesced, mr)
|
|
}
|
|
}
|
|
return coalesced
|
|
}
|