forked from Songmu/gocredits
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmatchtemplates.go
170 lines (154 loc) · 3.57 KB
/
matchtemplates.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
package gocredits
import (
"bufio"
"bytes"
"regexp"
"sort"
"strings"
"github.com/pmezard/licenses/assets"
)
type Word struct {
Text string
Pos int
}
type sortedWords []Word
func (s sortedWords) Len() int {
return len(s)
}
func (s sortedWords) Swap(i, j int) {
s[i], s[j] = s[j], s[i]
}
func (s sortedWords) Less(i, j int) bool {
return s[i].Pos < s[j].Pos
}
type MatchResult struct {
Template *Template
Score float64
ExtraWords []string
MissingWords []string
}
func sortAndReturnWords(words []Word) []string {
sort.Sort(sortedWords(words))
tokens := []string{}
for _, w := range words {
tokens = append(tokens, w.Text)
}
return tokens
}
var (
reWords = regexp.MustCompile(`[\w']+`)
reCopyright = regexp.MustCompile(
`(?i)\s*Copyright (?:©|\(c\)|\xC2\xA9)?\s*(?:\d{4}|\[year\]).*`)
)
func cleanLicenseData(data []byte) []byte {
data = bytes.ToLower(data)
data = reCopyright.ReplaceAll(data, nil)
return data
}
func makeWordSet(data []byte) map[string]int {
words := map[string]int{}
data = cleanLicenseData(data)
matches := reWords.FindAll(data, -1)
for i, m := range matches {
s := string(m)
if _, ok := words[s]; !ok {
// Non-matching words are likely in the license header, to mention
// copyrights and authors. Try to preserve the initial sequences,
// to display them later.
words[s] = i
}
}
return words
}
type Template struct {
Title string
Nickname string
Words map[string]int
}
func loadTemplates() ([]*Template, error) {
templates := []*Template{}
for _, a := range assets.Assets {
templ, err := parseTemplate(a.Content)
if err != nil {
return nil, err
}
templates = append(templates, templ)
}
return templates, nil
}
// matchTemplates returns the best license template matching supplied data,
// its score between 0 and 1 and the list of words appearing in license but not
// in the matched template.
func matchTemplates(license []byte, templates []*Template) MatchResult {
bestScore := float64(-1)
var bestTemplate *Template
bestExtra := []Word{}
bestMissing := []Word{}
words := makeWordSet(license)
for _, t := range templates {
extra := []Word{}
missing := []Word{}
common := 0
for w, pos := range words {
_, ok := t.Words[w]
if ok {
common++
} else {
extra = append(extra, Word{
Text: w,
Pos: pos,
})
}
}
for w, pos := range t.Words {
if _, ok := words[w]; !ok {
missing = append(missing, Word{
Text: w,
Pos: pos,
})
}
}
score := 2 * float64(common) / (float64(len(words)) + float64(len(t.Words)))
if score > bestScore {
bestScore = score
bestTemplate = t
bestMissing = missing
bestExtra = extra
}
}
return MatchResult{
Template: bestTemplate,
Score: bestScore,
ExtraWords: sortAndReturnWords(bestExtra),
MissingWords: sortAndReturnWords(bestMissing),
}
}
func parseTemplate(content string) (*Template, error) {
t := Template{}
text := []byte{}
state := 0
scanner := bufio.NewScanner(strings.NewReader(content))
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if state == 0 {
if line == "---" {
state = 1
}
} else if state == 1 {
if line == "---" {
state = 2
} else {
if strings.HasPrefix(line, "title:") {
t.Title = strings.TrimSpace(line[len("title:"):])
} else if strings.HasPrefix(line, "nickname:") {
t.Nickname = strings.TrimSpace(line[len("nickname:"):])
}
}
} else if state == 2 {
text = append(text, scanner.Bytes()...)
text = append(text, []byte("\n")...)
}
}
t.Words = makeWordSet(text)
return &t, scanner.Err()
}