mirror of https://github.com/knative/caching.git
473 lines
14 KiB
Go
473 lines
14 KiB
Go
// Copyright 2017 Google Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
// Package licenseclassifier provides methods to identify the open source
|
|
// license that most closely matches an unknown license.
|
|
package licenseclassifier
|
|
|
|
import (
|
|
"archive/tar"
|
|
"bytes"
|
|
"compress/gzip"
|
|
"fmt"
|
|
"html"
|
|
"io"
|
|
"math"
|
|
"regexp"
|
|
"sort"
|
|
"strings"
|
|
"sync"
|
|
"unicode"
|
|
|
|
"github.com/google/licenseclassifier/stringclassifier"
|
|
"github.com/google/licenseclassifier/stringclassifier/searchset"
|
|
)
|
|
|
|
// DefaultConfidenceThreshold is the lowest confidence score, expressed as a
// fraction of 1, at which a match is considered good.
const DefaultConfidenceThreshold = 0.80
|
|
|
|
var (
|
|
// Normalizers is a list of functions that get applied to the strings
|
|
// before they are registered with the string classifier.
|
|
Normalizers = []stringclassifier.NormalizeFunc{
|
|
html.UnescapeString,
|
|
removeShebangLine,
|
|
RemoveNonWords,
|
|
NormalizeEquivalentWords,
|
|
NormalizePunctuation,
|
|
strings.ToLower,
|
|
removeIgnorableTexts,
|
|
stringclassifier.FlattenWhitespace,
|
|
strings.TrimSpace,
|
|
}
|
|
|
|
// commonLicenseWords are words that are common to all known licenses.
|
|
// If an unknown text doesn't have at least one of these, then we can
|
|
// ignore it.
|
|
commonLicenseWords = []*regexp.Regexp{
|
|
regexp.MustCompile(`(?i)\bcode\b`),
|
|
regexp.MustCompile(`(?i)\blicense\b`),
|
|
regexp.MustCompile(`(?i)\boriginal\b`),
|
|
regexp.MustCompile(`(?i)\brights\b`),
|
|
regexp.MustCompile(`(?i)\bsoftware\b`),
|
|
regexp.MustCompile(`(?i)\bterms\b`),
|
|
regexp.MustCompile(`(?i)\bversion\b`),
|
|
regexp.MustCompile(`(?i)\bwork\b`),
|
|
}
|
|
)
|
|
|
|
// License is a classifier pre-loaded with known open source licenses.
|
|
type License struct {
|
|
c *stringclassifier.Classifier
|
|
|
|
// Threshold is the lowest confidence percentage acceptable for the
|
|
// classifier.
|
|
Threshold float64
|
|
|
|
// archive is a function that must return the contents of the license archive.
|
|
// When archive is nil, ReadLicenseFile(LicenseFile) is used to retrieve the
|
|
// contents.
|
|
archive func() ([]byte, error)
|
|
}
|
|
|
|
// OptionFunc set options on a License struct.
|
|
type OptionFunc func(l *License) error
|
|
|
|
// Archive is an OptionFunc to specify the location of the license archive file.
|
|
func Archive(f string) OptionFunc {
|
|
return func(l *License) error {
|
|
l.archive = func() ([]byte, error) { return ReadLicenseFile(f) }
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// ArchiveBytes is an OptionFunc that provides the contents of the license archive file.
|
|
// The caller must not overwrite the contents of b as it is not copied.
|
|
func ArchiveBytes(b []byte) OptionFunc {
|
|
return func(l *License) error {
|
|
l.archive = func() ([]byte, error) { return b, nil }
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// ArchiveFunc is an OptionFunc that provides a function that must return the contents
|
|
// of the license archive file.
|
|
func ArchiveFunc(f func() ([]byte, error)) OptionFunc {
|
|
return func(l *License) error {
|
|
l.archive = f
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// New creates a license classifier and pre-loads it with known open source licenses.
|
|
func New(threshold float64, options ...OptionFunc) (*License, error) {
|
|
classifier := &License{
|
|
c: stringclassifier.New(threshold, Normalizers...),
|
|
Threshold: threshold,
|
|
}
|
|
|
|
for _, o := range options {
|
|
err := o(classifier)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("error setting option %v: %v", o, err)
|
|
}
|
|
}
|
|
|
|
if err := classifier.registerLicenses(); err != nil {
|
|
return nil, fmt.Errorf("cannot register licenses from archive: %v", err)
|
|
}
|
|
return classifier, nil
|
|
}
|
|
|
|
// NewWithForbiddenLicenses creates a license classifier and pre-loads it with
|
|
// known open source licenses which are forbidden.
|
|
func NewWithForbiddenLicenses(threshold float64, options ...OptionFunc) (*License, error) {
|
|
opts := []OptionFunc{Archive(ForbiddenLicenseArchive)}
|
|
opts = append(opts, options...)
|
|
return New(threshold, opts...)
|
|
}
|
|
|
|
// WithinConfidenceThreshold returns true if the confidence value is above or
|
|
// equal to the confidence threshold.
|
|
func (c *License) WithinConfidenceThreshold(conf float64) bool {
|
|
return conf > c.Threshold || math.Abs(conf-c.Threshold) < math.SmallestNonzeroFloat64
|
|
}
|
|
|
|
// NearestMatch returns the "nearest" match to the given set of known licenses.
|
|
// Returned are the name of the license, and a confidence percentage indicating
|
|
// how confident the classifier is in the result.
|
|
func (c *License) NearestMatch(contents string) *stringclassifier.Match {
|
|
if !c.hasCommonLicenseWords(contents) {
|
|
return nil
|
|
}
|
|
m := c.c.NearestMatch(contents)
|
|
m.Name = strings.TrimSuffix(m.Name, ".header")
|
|
return m
|
|
}
|
|
|
|
// MultipleMatch matches all licenses within an unknown text.
|
|
func (c *License) MultipleMatch(contents string, includeHeaders bool) stringclassifier.Matches {
|
|
norm := normalizeText(contents)
|
|
if !c.hasCommonLicenseWords(norm) {
|
|
return nil
|
|
}
|
|
|
|
m := make(map[stringclassifier.Match]bool)
|
|
var matches stringclassifier.Matches
|
|
for _, v := range c.c.MultipleMatch(norm) {
|
|
if !c.WithinConfidenceThreshold(v.Confidence) {
|
|
continue
|
|
}
|
|
|
|
if !includeHeaders && strings.HasSuffix(v.Name, ".header") {
|
|
continue
|
|
}
|
|
|
|
v.Name = strings.TrimSuffix(v.Name, ".header")
|
|
if re, ok := forbiddenRegexps[v.Name]; ok && !re.MatchString(norm) {
|
|
continue
|
|
}
|
|
if _, ok := m[*v]; !ok {
|
|
m[*v] = true
|
|
matches = append(matches, v)
|
|
}
|
|
}
|
|
sort.Sort(matches)
|
|
return matches
|
|
}
|
|
|
|
func normalizeText(s string) string {
|
|
for _, n := range Normalizers {
|
|
s = n(s)
|
|
}
|
|
return s
|
|
}
|
|
|
|
// hasCommonLicenseWords returns true if the unknown text has at least one word
|
|
// that's common to all licenses.
|
|
func (c *License) hasCommonLicenseWords(s string) bool {
|
|
for _, re := range commonLicenseWords {
|
|
if re.MatchString(s) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
type archivedValue struct {
|
|
name string
|
|
normalized string
|
|
set *searchset.SearchSet
|
|
}
|
|
|
|
// registerLicenses loads all known licenses and adds them to c as known values
|
|
// for comparison. The allocated space after ingesting the 'licenses.db'
|
|
// archive is ~167M.
|
|
func (c *License) registerLicenses() error {
|
|
var contents []byte
|
|
var err error
|
|
if c.archive == nil {
|
|
contents, err = ReadLicenseFile(LicenseArchive)
|
|
} else {
|
|
contents, err = c.archive()
|
|
}
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
reader := bytes.NewReader(contents)
|
|
gr, err := gzip.NewReader(reader)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer gr.Close()
|
|
|
|
tr := tar.NewReader(gr)
|
|
|
|
var muVals sync.Mutex
|
|
var vals []archivedValue
|
|
for i := 0; ; i++ {
|
|
hdr, err := tr.Next()
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
name := strings.TrimSuffix(hdr.Name, ".txt")
|
|
|
|
// Read normalized value.
|
|
var b bytes.Buffer
|
|
if _, err := io.Copy(&b, tr); err != nil {
|
|
return err
|
|
}
|
|
normalized := b.String()
|
|
b.Reset()
|
|
|
|
// Read precomputed hashes.
|
|
hdr, err = tr.Next()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if _, err := io.Copy(&b, tr); err != nil {
|
|
return err
|
|
}
|
|
|
|
var set searchset.SearchSet
|
|
searchset.Deserialize(&b, &set)
|
|
|
|
muVals.Lock()
|
|
vals = append(vals, archivedValue{name, normalized, &set})
|
|
muVals.Unlock()
|
|
}
|
|
|
|
for _, v := range vals {
|
|
if err = c.c.AddPrecomputedValue(v.name, v.normalized, v.set); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// endOfLicenseText lists markers that commonly close a license. Text that
// follows such a marker can be removed.
var endOfLicenseText = []string{
	"END OF TERMS AND CONDITIONS",
}

// TrimExtraneousTrailingText cuts s immediately after the last occurrence of
// the first end-of-license marker it contains. If s contains no marker it is
// returned unchanged.
func TrimExtraneousTrailingText(s string) string {
	for i := range endOfLicenseText {
		marker := endOfLicenseText[i]
		pos := strings.LastIndex(s, marker)
		if pos == -1 {
			continue
		}
		return s[:pos+len(marker)]
	}
	return s
}
|
|
|
|
// copyrightRE matches a copyright line and captures the holder's name as its
// only submatch.
var copyrightRE = regexp.MustCompile(`(?m)(?i:Copyright)\s+(?i:©\s+|\(c\)\s+)?(?:\d{2,4})(?:[-,]\s*\d{2,4})*,?\s*(?i:by)?\s*(.*?(?i:\s+Inc\.)?)[.,]?\s*(?i:All rights reserved\.?)?\s*$`)

// CopyrightHolder returns the copyright holder named by the first copyright
// notice in contents, or the empty string if there is no such notice.
func CopyrightHolder(contents string) string {
	if groups := copyrightRE.FindStringSubmatch(contents); len(groups) == 2 {
		return groups[1]
	}
	return ""
}
|
|
|
|
var publicDomainRE = regexp.MustCompile("(?i)(this file )?is( in the)? public domain")
|
|
|
|
// HasPublicDomainNotice performs a simple regex over the contents to see if a
|
|
// public domain notice is in there. As you can imagine, this isn't 100%
|
|
// definitive, but can be useful if a license match isn't found.
|
|
func (c *License) HasPublicDomainNotice(contents string) bool {
|
|
return publicDomainRE.FindString(contents) != ""
|
|
}
|
|
|
|
// ignorableTexts is a list of lines at the start of the string we can remove
// to get a cleaner match.
var ignorableTexts = []*regexp.Regexp{
	regexp.MustCompile(`(?i)^(?:the )?mit license(?: \(mit\))?$`),
	regexp.MustCompile(`(?i)^(?:new )?bsd license$`),
	regexp.MustCompile(`(?i)^copyright and permission notice$`),
	regexp.MustCompile(`(?i)^copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]? .*$`),
	regexp.MustCompile(`(?i)^(all|some) rights reserved\.?$`),
	regexp.MustCompile(`(?i)^@license$`),
	regexp.MustCompile(`^\s*$`),
}

// removeIgnorableTexts removes common text, which is not important for
// classification, that shows up before the body of the license.
//
// Trailing newlines are trimmed from s, then leading lines are dropped for as
// long as each one, with surrounding whitespace removed, matches an entry in
// ignorableTexts. The remaining lines are returned joined by "\n" with a
// single trailing "\n"; if every line is ignorable the result is "\n".
func removeIgnorableTexts(s string) string {
	ignorable := func(line string) bool {
		for _, re := range ignorableTexts {
			if re.MatchString(line) {
				return true
			}
		}
		return false
	}

	lines := strings.Split(strings.TrimRight(s, "\n"), "\n")
	start := 0
	for start < len(lines) && ignorable(strings.TrimSpace(lines[start])) {
		start++
	}
	// start never exceeds len(lines), so the slice below is always valid and
	// is empty when every line was ignorable.
	return strings.Join(lines[start:], "\n") + "\n"
}
|
|
|
|
// removeShebangLine drops the first line of s when it starts with "#!". If s
// does not start with "#!", or consists of a single line with no newline, s
// is returned unchanged.
func removeShebangLine(s string) string {
	if !strings.HasPrefix(s, "#!") {
		return s
	}
	nl := strings.Index(s, "\n")
	if nl < 0 {
		// A lone shebang line is left as it is.
		return s
	}
	return s[nl+1:]
}
|
|
|
|
// isDecorative reports whether s contains no letters and no digits. The
// empty string is decorative.
func isDecorative(s string) bool {
	isAlnum := func(r rune) bool { return unicode.IsLetter(r) || unicode.IsDigit(r) }
	return strings.IndexFunc(s, isAlnum) == -1
}
|
|
|
|
// nonWords matches a run of one or more ASCII punctuation characters.
var nonWords = regexp.MustCompile("[[:punct:]]+")

// RemoveNonWords replaces every run of punctuation in s with a single space.
func RemoveNonWords(s string) string {
	const blank = " "
	return nonWords.ReplaceAllString(s, blank)
}
|
|
|
|
// interchangeablePunctuation lists punctuation patterns together with the
// normalized text each one is replaced by. The entries are applied in order.
var interchangeablePunctuation = []struct {
	interchangeable *regexp.Regexp
	substitute      string
}{
	// Hyphen, Dash, En Dash, and Em Dash.
	{interchangeable: regexp.MustCompile(`[-‒–—]`), substitute: "-"},
	// Single, Double, Curly Single, and Curly Double.
	{interchangeable: regexp.MustCompile("['\"`‘’“”]"), substitute: "'"},
	// Copyright.
	{interchangeable: regexp.MustCompile("©"), substitute: "(c)"},
	// Hyphen-separated words.
	{interchangeable: regexp.MustCompile(`(\S)-\s+(\S)`), substitute: "${1}-${2}"},
	// Currency and Section. (Different copies of the CDDL use each marker.)
	{interchangeable: regexp.MustCompile("[§¤]"), substitute: "(s)"},
	// Middle Dot
	{interchangeable: regexp.MustCompile("·"), substitute: "*"},
}

// NormalizePunctuation rewrites interchangeable punctuation in s, such as
// the various dashes and quotes, to one normalized form per kind.
func NormalizePunctuation(s string) string {
	for i := range interchangeablePunctuation {
		rule := &interchangeablePunctuation[i]
		s = rule.interchangeable.ReplaceAllString(s, rule.substitute)
	}
	return s
}
|
|
|
|
// interchangeableWords pairs case-insensitive spelling patterns with the
// normalized spelling that replaces them, so that equivalent words compare
// equal without changing the meaning of the license. See
// https://spdx.org/spdx-license-list/matching-guidelines for the list.
//
// The rules are applied in order and match anywhere in the text, including
// inside longer words.
//
// NOTE(review): because of that, a later rule whose pattern contains an
// earlier pattern (e.g. "Favourite" after "Favour", "Fulfilment" after
// "Fulfil") can no longer match, and "Fulfil" also matches inside "fulfill".
// The "Wilful" rule substitutes "Wilfull". Confirm whether these are
// intended before changing them: the license archive stores text that was
// normalized ahead of time.
var interchangeableWords = []struct {
	interchangeable *regexp.Regexp
	substitute      string
}{
	{interchangeable: regexp.MustCompile("(?i)Acknowledgment"), substitute: "Acknowledgement"},
	{interchangeable: regexp.MustCompile("(?i)Analogue"), substitute: "Analog"},
	{interchangeable: regexp.MustCompile("(?i)Analyse"), substitute: "Analyze"},
	{interchangeable: regexp.MustCompile("(?i)Artefact"), substitute: "Artifact"},
	{interchangeable: regexp.MustCompile("(?i)Authorisation"), substitute: "Authorization"},
	{interchangeable: regexp.MustCompile("(?i)Authorised"), substitute: "Authorized"},
	{interchangeable: regexp.MustCompile("(?i)Calibre"), substitute: "Caliber"},
	{interchangeable: regexp.MustCompile("(?i)Cancelled"), substitute: "Canceled"},
	{interchangeable: regexp.MustCompile("(?i)Capitalisations"), substitute: "Capitalizations"},
	{interchangeable: regexp.MustCompile("(?i)Catalogue"), substitute: "Catalog"},
	{interchangeable: regexp.MustCompile("(?i)Categorise"), substitute: "Categorize"},
	{interchangeable: regexp.MustCompile("(?i)Centre"), substitute: "Center"},
	{interchangeable: regexp.MustCompile("(?i)Emphasised"), substitute: "Emphasized"},
	{interchangeable: regexp.MustCompile("(?i)Favour"), substitute: "Favor"},
	{interchangeable: regexp.MustCompile("(?i)Favourite"), substitute: "Favorite"},
	{interchangeable: regexp.MustCompile("(?i)Fulfil"), substitute: "Fulfill"},
	{interchangeable: regexp.MustCompile("(?i)Fulfilment"), substitute: "Fulfillment"},
	{interchangeable: regexp.MustCompile("(?i)Initialise"), substitute: "Initialize"},
	{interchangeable: regexp.MustCompile("(?i)Judgment"), substitute: "Judgement"},
	{interchangeable: regexp.MustCompile("(?i)Labelling"), substitute: "Labeling"},
	{interchangeable: regexp.MustCompile("(?i)Labour"), substitute: "Labor"},
	{interchangeable: regexp.MustCompile("(?i)Licence"), substitute: "License"},
	{interchangeable: regexp.MustCompile("(?i)Maximise"), substitute: "Maximize"},
	{interchangeable: regexp.MustCompile("(?i)Modelled"), substitute: "Modeled"},
	{interchangeable: regexp.MustCompile("(?i)Modelling"), substitute: "Modeling"},
	{interchangeable: regexp.MustCompile("(?i)Offence"), substitute: "Offense"},
	{interchangeable: regexp.MustCompile("(?i)Optimise"), substitute: "Optimize"},
	{interchangeable: regexp.MustCompile("(?i)Organisation"), substitute: "Organization"},
	{interchangeable: regexp.MustCompile("(?i)Organise"), substitute: "Organize"},
	{interchangeable: regexp.MustCompile("(?i)Practise"), substitute: "Practice"},
	{interchangeable: regexp.MustCompile("(?i)Programme"), substitute: "Program"},
	{interchangeable: regexp.MustCompile("(?i)Realise"), substitute: "Realize"},
	{interchangeable: regexp.MustCompile("(?i)Recognise"), substitute: "Recognize"},
	{interchangeable: regexp.MustCompile("(?i)Signalling"), substitute: "Signaling"},
	{interchangeable: regexp.MustCompile("(?i)Sub[- ]license"), substitute: "Sublicense"},
	{interchangeable: regexp.MustCompile("(?i)Utilisation"), substitute: "Utilization"},
	{interchangeable: regexp.MustCompile("(?i)Whilst"), substitute: "While"},
	{interchangeable: regexp.MustCompile("(?i)Wilful"), substitute: "Wilfull"},
	{interchangeable: regexp.MustCompile("(?i)Non-commercial"), substitute: "Noncommercial"},
	{interchangeable: regexp.MustCompile("(?i)Per cent"), substitute: "Percent"},
}

// NormalizeEquivalentWords replaces each interchangeable spelling found in s
// with its normalized form. The replacement text is inserted as written in
// the table, regardless of the case of the matched text.
func NormalizeEquivalentWords(s string) string {
	for i := range interchangeableWords {
		rule := &interchangeableWords[i]
		s = rule.interchangeable.ReplaceAllString(s, rule.substitute)
	}
	return s
}
|