// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package stringclassifier finds the nearest match between a string and a set of known values. It
// uses the Levenshtein Distance (LD) algorithm to determine this. A match with a large LD is less
// likely to be correct than one with a small LD. A confidence percentage is returned, which
// indicates how confident the algorithm is that the match is correct. The higher the percentage,
// the greater the confidence that the match is correct.
//
// Example Usage:
//
//   type Text struct {
//     Name string
//     Text string
//   }
//
//   func NewClassifier(knownTexts []Text) (*stringclassifier.Classifier, error) {
//     sc := stringclassifier.New(stringclassifier.DefaultConfidenceThreshold, stringclassifier.FlattenWhitespace)
//     for _, known := range knownTexts {
//       if err := sc.AddValue(known.Name, known.Text); err != nil {
//         return nil, err
//       }
//     }
//     return sc, nil
//   }
//
//   func IdentifyTexts(sc *stringclassifier.Classifier, unknownTexts []*Text) {
//     for _, unknown := range unknownTexts {
//       m := sc.NearestMatch(unknown.Text)
//       log.Printf("The nearest match to %q is %q (confidence: %v)",
//         unknown.Name, m.Name, m.Confidence)
//     }
//   }
package stringclassifier

import (
	"fmt"
	"log"
	"math"
	"regexp"
	"sort"
	"sync"

	"github.com/google/licenseclassifier/stringclassifier/internal/pq"
	"github.com/google/licenseclassifier/stringclassifier/searchset"
	"github.com/sergi/go-diff/diffmatchpatch"
)

// The diff/match/patch algorithm.
var dmp = diffmatchpatch.New()

const (
	// DefaultConfidenceThreshold is the minimum ratio threshold between
	// the matching range and the full source range that we're willing to
	// accept in order to say that the matching range will produce a
	// sufficiently good edit distance. I.e., if the matching range is
	// below this threshold we won't run the Levenshtein Distance algorithm
	// on it.
	DefaultConfidenceThreshold float64 = 0.80

	defaultMinDiffRatio float64 = 0.75
)
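
// Illustrative arithmetic: with DefaultConfidenceThreshold (0.80), a known
// value whose search set contains 100 tokens needs a candidate match range
// covering roughly 0.80*100 = 80 of those tokens before the more expensive
// Levenshtein comparison is attempted; smaller ranges are discarded early by
// withinConfidenceThreshold below. (The token counts here are assumed values
// chosen only to make the ratio concrete.)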

// A Classifier matches a string to a set of known values.
type Classifier struct {
	muValues    sync.RWMutex
	values      map[string]*knownValue
	normalizers []NormalizeFunc
	threshold   float64

	// MinDiffRatio defines the minimum ratio of the length difference
	// allowed to consider a known value a possible match. This is used as
	// a performance optimization to eliminate values that are unlikely to
	// be a match.
	//
	// For example, a value of 0.75 means that the shorter string must be
	// at least 75% the length of the longer string to consider it a
	// possible match.
	//
	// Setting this to 1.0 will require that strings are identical length.
	// Setting this to 0 will consider all known values as possible
	// matches.
	MinDiffRatio float64
}
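
// A hedged sketch of how MinDiffRatio prunes candidates in NearestMatch (the
// lengths below are assumed, illustrative values):
//
//   sc := New(DefaultConfidenceThreshold, FlattenWhitespace)
//   // With the default MinDiffRatio of 0.75, a known value of length 40 is
//   // never compared against an unknown string of length 20, because
//   // 20/40 = 0.50 < 0.75 (diffRatio below computes that ratio).
//   sc.MinDiffRatio = 0.5 // loosen the pruning so such pairs are considered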

// NormalizeFunc is a function that is used to normalize a string prior to comparison.
type NormalizeFunc func(string) string

// New creates a new Classifier with the given confidence threshold and the
// provided NormalizeFuncs. Each NormalizeFunc is applied in order to a string
// before comparison.
func New(threshold float64, funcs ...NormalizeFunc) *Classifier {
	return &Classifier{
		values:       make(map[string]*knownValue),
		normalizers:  append([]NormalizeFunc(nil), funcs...),
		threshold:    threshold,
		MinDiffRatio: defaultMinDiffRatio,
	}
}
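
// Example usage (a minimal sketch; knownTexts is an assumed map[string]string
// supplied by the caller):
//
//   sc := stringclassifier.New(stringclassifier.DefaultConfidenceThreshold, stringclassifier.FlattenWhitespace)
//   for name, text := range knownTexts {
//     if err := sc.AddValue(name, text); err != nil {
//       log.Printf("skipping duplicate key %q: %v", name, err)
//     }
//   }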

// knownValue identifies a value in the corpus to match against.
type knownValue struct {
	key             string
	normalizedValue string
	reValue         *regexp.Regexp
	set             *searchset.SearchSet
}

// AddValue adds a known value to be matched against. If a value already exists
// for key, an error is returned.
func (c *Classifier) AddValue(key, value string) error {
	c.muValues.Lock()
	defer c.muValues.Unlock()
	if _, ok := c.values[key]; ok {
		return fmt.Errorf("value already registered with key %q", key)
	}
	norm := c.normalize(value)
	c.values[key] = &knownValue{
		key:             key,
		normalizedValue: norm,
		// Quote the normalized value so it is matched literally rather
		// than interpreted as a regular expression.
		reValue: regexp.MustCompile(regexp.QuoteMeta(norm)),
	}
	return nil
}

// AddPrecomputedValue adds a known value to be matched against. The value has
// already been normalized and the SearchSet object deserialized, so no
// processing is necessary.
func (c *Classifier) AddPrecomputedValue(key, value string, set *searchset.SearchSet) error {
	c.muValues.Lock()
	defer c.muValues.Unlock()
	if _, ok := c.values[key]; ok {
		return fmt.Errorf("value already registered with key %q", key)
	}
	set.GenerateNodeList()
	c.values[key] = &knownValue{
		key:             key,
		normalizedValue: value,
		// As in AddValue, the value is quoted so it is matched literally.
		reValue: regexp.MustCompile(regexp.QuoteMeta(value)),
		set:     set,
	}
	return nil
}

// normalize a string by applying each of the registered NormalizeFuncs.
func (c *Classifier) normalize(s string) string {
	for _, fn := range c.normalizers {
		s = fn(s)
	}
	return s
}

// Match identifies the result of matching a string against a knownValue.
type Match struct {
	Name       string  // Name of the knownValue that was matched
	Confidence float64 // Confidence percentage
	Offset     int     // The offset into the unknown string at which the match was made
	Extent     int     // The length of the match from the offset into the unknown string
}
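
// For orientation, an assumed, illustrative result: a Match with Offset 10 and
// Extent 25 means the region [10, 35) of the normalized unknown string is what
// matched the named known value. Callers can recover the matched region like
// this (assuming unknownText is already normalized):
//
//   for _, m := range sc.MultipleMatch(unknownText) {
//     fmt.Println(m.Name, unknownText[m.Offset:m.Offset+m.Extent])
//   }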

// Matches is a list of Match-es. This is here mainly so that the list can be
// sorted.
type Matches []*Match

func (m Matches) Len() int      { return len(m) }
func (m Matches) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
func (m Matches) Less(i, j int) bool {
	if math.Abs(m[j].Confidence-m[i].Confidence) < math.SmallestNonzeroFloat64 {
		if m[i].Name == m[j].Name {
			if m[i].Offset > m[j].Offset {
				return false
			}
			if m[i].Offset == m[j].Offset {
				return m[i].Extent > m[j].Extent
			}
			return true
		}
		return m[i].Name < m[j].Name
	}
	return m[i].Confidence > m[j].Confidence
}

// Names returns an unsorted slice of the names of the matched values.
func (m Matches) Names() []string {
	var names []string
	for _, n := range m {
		names = append(names, n.Name)
	}
	return names
}

// uniquify goes through the matches and removes any that are contained within
// one with a higher confidence. This assumes that Matches is sorted.
func (m Matches) uniquify() Matches {
	type matchedRange struct {
		offset, extent int
	}

	var matched []matchedRange
	var matches Matches
OUTER:
	for _, match := range m {
		for _, mr := range matched {
			if match.Offset >= mr.offset && match.Offset <= mr.offset+mr.extent {
				continue OUTER
			}
		}
		matched = append(matched, matchedRange{match.Offset, match.Extent})
		matches = append(matches, match)
	}

	return matches
}

// NearestMatch returns the name of the known value that most closely matches
// the unknown string, along with a confidence percentage indicating how
// confident the classifier is in the result. A percentage of "1.0" indicates
// an exact match, while a percentage of "0.0" indicates a complete mismatch.
//
// If the string is equidistant from multiple known values, it is undefined
// which will be returned.
func (c *Classifier) NearestMatch(s string) *Match {
	pq := c.nearestMatch(s)
	if pq.Len() == 0 {
		return &Match{}
	}
	return pq.Pop().(*Match)
}
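
// A minimal usage sketch; the corpus below is an assumed, illustrative value:
//
//   sc := stringclassifier.New(stringclassifier.DefaultConfidenceThreshold, stringclassifier.FlattenWhitespace)
//   _ = sc.AddValue("greeting", "Hello, World! How are you today?")
//   m := sc.NearestMatch("hello,  world! how are you today?")
//   // m.Name == "greeting"; m.Confidence is high but below 1.0 because of
//   // the case differences (FlattenWhitespace only collapses whitespace).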

// MultipleMatch tries to determine which known strings are found within an
// unknown string. This differs from "NearestMatch" in that it looks only at
// those areas within the unknown string that are likely to match. A list of
// potential matches is returned. It's up to the caller to determine which
// ones are acceptable.
func (c *Classifier) MultipleMatch(s string) (matches Matches) {
	pq := c.multipleMatch(s)
	if pq == nil {
		return matches
	}

	// A map to remove duplicate entries.
	m := make(map[Match]bool)

	for pq.Len() != 0 {
		v := pq.Pop().(*Match)
		if _, ok := m[*v]; !ok {
			m[*v] = true
			matches = append(matches, v)
		}
	}

	sort.Sort(matches)
	return matches.uniquify()
}
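
// A hedged sketch of MultipleMatch in use; headerText, footerText, and
// document are assumed inputs, and the 0.90 acceptance bar is the caller's
// own choice:
//
//   sc := stringclassifier.New(stringclassifier.DefaultConfidenceThreshold, stringclassifier.FlattenWhitespace)
//   _ = sc.AddValue("header", headerText)
//   _ = sc.AddValue("footer", footerText)
//   for _, m := range sc.MultipleMatch(document) {
//     if m.Confidence >= 0.90 {
//       fmt.Printf("%s at [%d, %d) confidence %.2f\n", m.Name, m.Offset, m.Offset+m.Extent, m.Confidence)
//     }
//   }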

// possibleMatch identifies a known value and its diffRatio to a given string.
type possibleMatch struct {
	value     *knownValue
	diffRatio float64
}

// likelyMatches is a slice of possibleMatches that can be sorted by their
// diffRatio to a given string, such that the most likely matches (based on
// length) are at the beginning.
type likelyMatches []possibleMatch

func (m likelyMatches) Len() int           { return len(m) }
func (m likelyMatches) Less(i, j int) bool { return m[i].diffRatio > m[j].diffRatio }
func (m likelyMatches) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }

// nearestMatch returns a Queue of values that the unknown string may be. The
// values are compared via their Levenshtein Distance and ranked with the
// nearest match at the beginning.
func (c *Classifier) nearestMatch(unknown string) *pq.Queue {
	var mu sync.Mutex // Protect the priority queue.
	pq := pq.NewQueue(func(x, y interface{}) bool {
		return x.(*Match).Confidence > y.(*Match).Confidence
	}, nil)

	unknown = c.normalize(unknown)
	if len(unknown) == 0 {
		return pq
	}

	c.muValues.RLock()
	var likely likelyMatches
	for _, v := range c.values {
		dr := diffRatio(unknown, v.normalizedValue)
		if dr < c.MinDiffRatio {
			continue
		}
		if unknown == v.normalizedValue {
			// We found an exact match.
			pq.Push(&Match{Name: v.key, Confidence: 1.0, Offset: 0, Extent: len(unknown)})
			c.muValues.RUnlock()
			return pq
		}
		likely = append(likely, possibleMatch{value: v, diffRatio: dr})
	}
	c.muValues.RUnlock()
	sort.Sort(likely)

	var wg sync.WaitGroup
	classifyString := func(name, unknown, known string) {
		defer wg.Done()

		diffs := dmp.DiffMain(unknown, known, true)
		distance := dmp.DiffLevenshtein(diffs)
		confidence := confidencePercentage(len(unknown), len(known), distance)
		if confidence > 0.0 {
			mu.Lock()
			pq.Push(&Match{Name: name, Confidence: confidence, Offset: 0, Extent: len(unknown)})
			mu.Unlock()
		}
	}

	wg.Add(len(likely))
	for _, known := range likely {
		go classifyString(known.value.key, unknown, known.value.normalizedValue)
	}
	wg.Wait()
	return pq
}

// matcher finds all potential matches of "known" in "unknown". The results are
// placed in "queue".
type matcher struct {
	unknown     *searchset.SearchSet
	normUnknown string
	threshold   float64

	mu    sync.Mutex
	queue *pq.Queue
}

// newMatcher creates a "matcher" object.
func newMatcher(unknown string, threshold float64) *matcher {
	return &matcher{
		unknown:     searchset.New(unknown, searchset.DefaultGranularity),
		normUnknown: unknown,
		threshold:   threshold,
		queue: pq.NewQueue(func(x, y interface{}) bool {
			return x.(*Match).Confidence > y.(*Match).Confidence
		}, nil),
	}
}

// findMatches takes a known text and finds all potential instances of it in
// the unknown text. The resulting matches can then be filtered to determine
// which are the best matches.
func (m *matcher) findMatches(known *knownValue) {
	var mrs []searchset.MatchRanges
	if all := known.reValue.FindAllStringIndex(m.normUnknown, -1); all != nil {
		// We found exact matches. Just use those!
		for _, a := range all {
			var start, end int
			for i, tok := range m.unknown.Tokens {
				if tok.Offset == a[0] {
					start = i
				} else if tok.Offset >= a[len(a)-1]-len(tok.Text) {
					end = i
					break
				}
			}

			mrs = append(mrs, searchset.MatchRanges{{
				SrcStart:    0,
				SrcEnd:      len(known.set.Tokens),
				TargetStart: start,
				TargetEnd:   end + 1,
			}})
		}
	} else {
		// No exact match. Perform a more thorough match.
		mrs = searchset.FindPotentialMatches(known.set, m.unknown)
	}

	var wg sync.WaitGroup
	for _, mr := range mrs {
		if !m.withinConfidenceThreshold(known.set, mr) {
			continue
		}

		wg.Add(1)
		go func(mr searchset.MatchRanges) {
			start, end := mr.TargetRange(m.unknown)
			conf := levDist(m.normUnknown[start:end], known.normalizedValue)
			if conf > 0.0 {
				m.mu.Lock()
				m.queue.Push(&Match{Name: known.key, Confidence: conf, Offset: start, Extent: end - start})
				m.mu.Unlock()
			}
			wg.Done()
		}(mr)
	}
	wg.Wait()
}

// withinConfidenceThreshold reports whether the ratio of the matching range's
// size to the size of the original known text meets the matcher's confidence
// threshold.
func (m *matcher) withinConfidenceThreshold(known *searchset.SearchSet, mr searchset.MatchRanges) bool {
	return float64(mr.Size())/float64(len(known.Tokens)) >= m.threshold
}
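
// Worked example with assumed sizes: if len(known.Tokens) == 200 and
// mr.Size() == 150, the ratio is 150.0/200.0 = 0.75, which fails the default
// threshold of 0.80, so the range is skipped before any Levenshtein
// computation.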

// multipleMatch returns a Queue of values that might be within the unknown
// string. The values are compared via their Levenshtein Distance and ranked
// with the nearest match at the beginning.
func (c *Classifier) multipleMatch(unknown string) *pq.Queue {
	normUnknown := c.normalize(unknown)
	if normUnknown == "" {
		return nil
	}

	m := newMatcher(normUnknown, c.threshold)

	c.muValues.RLock()
	var kvals []*knownValue
	for _, known := range c.values {
		kvals = append(kvals, known)
	}
	c.muValues.RUnlock()

	var wg sync.WaitGroup
	wg.Add(len(kvals))
	for _, known := range kvals {
		go func(known *knownValue) {
			if known.set == nil {
				k := searchset.New(known.normalizedValue, searchset.DefaultGranularity)
				c.muValues.Lock()
				c.values[known.key].set = k
				c.muValues.Unlock()
			}
			m.findMatches(known)
			wg.Done()
		}(known)
	}
	wg.Wait()
	return m.queue
}

// levDist runs the Levenshtein Distance algorithm on the known and unknown
// texts to measure how well they match.
func levDist(unknown, known string) float64 {
	if len(known) == 0 || len(unknown) == 0 {
		log.Printf("Zero-sized texts in Levenshtein Distance algorithm: known==%d, unknown==%d", len(known), len(unknown))
		return 0.0
	}

	// Calculate the differences between the potentially matching known
	// text and the unknown text.
	diffs := dmp.DiffMain(unknown, known, false)
	end := diffRangeEnd(known, diffs)

	// Now execute the Levenshtein Distance algorithm to see how much it
	// does match.
	distance := dmp.DiffLevenshtein(diffs[:end])
	return confidencePercentage(unknownTextLength(unknown, diffs), len(known), distance)
}
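
// Rough illustration with assumed inputs: levDist("hella world", "hello world")
// diffs the two strings, the Levenshtein distance over the relevant diff range
// is 1, and the result is approximately 1 - 1/11 ≈ 0.91 via
// confidencePercentage below.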

// unknownTextLength returns the length of the unknown text based on the diff range.
func unknownTextLength(unknown string, diffs []diffmatchpatch.Diff) int {
	last := len(diffs) - 1
	for ; last >= 0; last-- {
		if diffs[last].Type == diffmatchpatch.DiffEqual {
			break
		}
	}
	ulen := 0
	for i := 0; i < last+1; i++ {
		switch diffs[i].Type {
		case diffmatchpatch.DiffEqual, diffmatchpatch.DiffDelete:
			ulen += len(diffs[i].Text)
		}
	}
	return ulen
}

// diffRangeEnd returns the end index for the "Diff" objects that construct
// (or nearly construct) the "known" value.
func diffRangeEnd(known string, diffs []diffmatchpatch.Diff) (end int) {
	var seen string
	for end = 0; end < len(diffs); end++ {
		if seen == known {
			// Once we've constructed the "known" value, then we've
			// reached the point in the diff list where more
			// "Diff"s would just make the Levenshtein Distance
			// less valid. There shouldn't be further "DiffEqual"
			// nodes, because there's nothing further to match in
			// the "known" text.
			break
		}
		switch diffs[end].Type {
		case diffmatchpatch.DiffEqual, diffmatchpatch.DiffInsert:
			seen += diffs[end].Text
		}
	}
	return end
}

// confidencePercentage calculates how confident we are in the result of the
// match. A percentage of "1.0" means an identical match. A confidence of "0.0"
// means a complete mismatch.
func confidencePercentage(ulen, klen, distance int) float64 {
	if ulen == 0 && klen == 0 {
		return 1.0
	}
	if ulen == 0 || klen == 0 || (distance > ulen && distance > klen) {
		return 0.0
	}
	return 1.0 - float64(distance)/float64(max(ulen, klen))
}
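
// Worked example with assumed lengths: confidencePercentage(100, 120, 30)
// returns 1.0 - 30.0/120.0 = 0.75; the edit distance is measured against the
// longer of the two lengths.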

// diffRatio calculates the ratio of the length of s1 and s2, returned as a
// percentage of the length of the longer string. E.g., diffRatio("abcd", "e")
// would return 0.25 because "e" is 25% of the size of "abcd". Comparing
// strings of equal length will return 1.
func diffRatio(s1, s2 string) float64 {
	x, y := len(s1), len(s2)
	if x == 0 && y == 0 {
		// Both strings are zero length
		return 1.0
	}
	if x < y {
		return float64(x) / float64(y)
	}
	return float64(y) / float64(x)
}

func max(a, b int) int {
	if a > b {
		return a
	}
	return b
}

func min(a, b int) int {
	if a < b {
		return a
	}
	return b
}

// wsRegexp is a regexp used to identify blocks of whitespace.
var wsRegexp = regexp.MustCompile(`\s+`)

// FlattenWhitespace will flatten contiguous blocks of whitespace down to a single space.
var FlattenWhitespace NormalizeFunc = func(s string) string {
	return wsRegexp.ReplaceAllString(s, " ")
}
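
// A small sketch of the normalization behavior (the input is illustrative):
//
//   FlattenWhitespace("a \t b\n\nc") // => "a b c"
//
// Additional NormalizeFuncs passed to New are applied in the order given, so
// callers can combine FlattenWhitespace with, for example, a lower-casing
// function.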