diff --git a/phrase/paragraph.go b/phrase/paragraph.go index 2db3465..aa74de8 100644 --- a/phrase/paragraph.go +++ b/phrase/paragraph.go @@ -17,7 +17,6 @@ import ( var ( spacesReg = regexp.MustCompile(`[\s]+`) - allowReg = regexp.MustCompile(`[a-zA-Z0-9\.,\?\!;\(\)\[\]\&\=\-_@\s]`) // Option set pinyin style args option Option = gpy.Args{ @@ -25,18 +24,20 @@ var ( Heteronym: true, } - hanSymbols = map[string]string{ - "?": "?", - "!": "!", - ":": ":", - "。": ".", - ",": ",", - ";": ";", - "(": "(", - ")": ")", - "【": "[", - "】": "]", - "、": ",", + hanSymbols = map[rune]rune{ + '?': '?', + '!': '!', + ':': ':', + '。': '.', + ',': ',', + ';': ';', + '(': '(', + ')': ')', + '【': '[', + '】': ']', + '、': ',', + '“': '"', + '”': '"', } ) @@ -66,52 +67,35 @@ func Join(a []string) (s string) { // including letters, numbers, symbols func Paragraph(p string, segs ...gse.Segmenter) (s string) { p = pinyinPhrase(p, segs...) - + var b strings.Builder + var last rune for _, r := range p { if unicode.Is(unicode.Han, r) { // Han chars + if last != 0 && !isPunctOrSymbol(last) { + b.WriteRune(' ') + } result := gpy.HanPinyin(string(r), Option) if len(result) == 0 { continue } - if len(result[0]) == 0 { continue } - - s += " " + string(result[0][0]) + " " + b.WriteString(result[0][0]) + } else if symbol, ok := hanSymbols[r]; ok { + // Han symbols + b.WriteRune(symbol) } else { - // Not han chars - char := string(r) - - if allowReg.MatchString(char) { - s += char - } else { - if hanSymbols[char] != "" { - s += hanSymbols[char] - } - } + // Other + b.WriteRune(r) } + last = r } + s = b.String() // trim the two continuous spaces s = spacesReg.ReplaceAllString(s, " ") - m := map[string]string{ - " ,": ",", - " .": ".", - " ?": "?", - " !": "!", - " ;": ";", - " :": ":", - " (": "(", - ") ": ")", - "[ ": "[", - " ]": "]", - } - - for k, v := range m { - s = strings.Replace(s, k, v, -1) - } s = strings.TrimSpace(s) return diff --git a/phrase/paragraph_test.go b/phrase/paragraph_test.go 
index ed9c632..f496b96 100644 --- a/phrase/paragraph_test.go +++ b/phrase/paragraph_test.go @@ -31,6 +31,15 @@ func TestParagraph(t *testing.T) { "香港维多利亚港": "xiang gang wei duo li ya gang", "上海外滩, 陆家嘴上海中心大厦": "shang hai wai tan, lu jia zui shang hai zhong xin da sha", "北京八达岭长城": "bei jing ba da ling chang cheng", + + "你好Golang": "ni hao Golang", + "2006年": "2006 nian", + "价格$100": "jia ge$100", + "邮箱:test@mail.com": "you xiang:test@mail.com", + "你好!": "ni hao!", + "“你好”": "\"ni hao\"", + "中文#标签": "zhong wen#biao qian", + "阅读《Astérix》": "yue du《Astérix》", } seg, err := gse.New("zh, ../examples/dict.txt") diff --git a/phrase/phrase_cut.go b/phrase/phrase_cut.go index 7b9a409..689eda2 100644 --- a/phrase/phrase_cut.go +++ b/phrase/phrase_cut.go @@ -8,6 +8,7 @@ package phrase import ( "strings" + "unicode" "github.com/go-ego/gpy" "github.com/go-ego/gse" @@ -62,10 +63,21 @@ func Match(word string) string { return match } +func isPunctOrSymbol(r rune) bool { + return unicode.IsOneOf([]*unicode.RangeTable{unicode.Punct, unicode.Symbol}, r) +} + func matchs(s, word string) string { - match := Match(word) - if match != "" { - s = strings.Replace(s, word, " "+match+" ", 1) + if match := Match(word); match != "" { + if before, after, found := strings.Cut(s, word); found { + if before := []rune(before); len(before) > 0 && !isPunctOrSymbol(before[len(before)-1]) { + match = " " + match + } + if after := []rune(after); len(after) > 0 && !isPunctOrSymbol(after[0]) { + match += " " + } + s = strings.Replace(s, word, match, 1) + } } return s diff --git a/phrase/phrase_test.go b/phrase/phrase_test.go index af05cb8..743eb56 100644 --- a/phrase/phrase_test.go +++ b/phrase/phrase_test.go @@ -18,12 +18,12 @@ func Test_pinyinPhrase(t *testing.T) { expects := map[string]string{ "西雅图太空针, The Space Nedle": "西雅图 tai kong 针, The Space Nedle", "旧金山湾金门大桥": "旧金山湾金门 da qiao", - "纽约帝国大厦, 纽约时代广场, 世贸中心": "纽约帝国 da sha , 纽约时代 guang chang , 世贸中心", + "纽约帝国大厦, 纽约时代广场, 世贸中心": "纽约帝国 da sha, 纽约时代 guang 
chang, 世贸中心", "多伦多加拿大国家电视塔, the CN Tower, 尼亚加拉大瀑布": "多伦多加拿 da guo jia dian 视塔, the CN Tower, 尼亚加拉大 pu bu", "伦敦泰晤士河, 大笨钟": "lun dun 泰晤士河, 大笨钟", "洛杉矶好莱坞": "洛杉矶 hao lai wu", - "东京都, 东京晴空塔": "东 jing du , 东京 qing kong 塔", + "东京都, 东京晴空塔": "东 jing du, 东京 qing kong 塔", "巴黎埃菲尔铁塔": "巴黎 ai fei er tie ta", "香港维多利亚港": "xiang gang 维多利亚港", "上海外滩, 陆家嘴上海中心大厦": "shang hai 外滩, 陆家嘴 shang hai 中心 da sha",