Skip to content

Refactor: Improve space insertion logic for Pinyin conversion #29

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 27 additions & 43 deletions phrase/paragraph.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,26 +17,27 @@ import (

var (
spacesReg = regexp.MustCompile(`[\s]+`)
allowReg = regexp.MustCompile(`[a-zA-Z0-9\.,\?\!;\(\)\[\]\&\=\-_@\s]`)

// Option holds the gpy arguments (style and heteronym setting) used for pinyin conversion.
Option = gpy.Args{
Style: gpy.Normal,
Heteronym: true,
}

hanSymbols = map[string]string{
"?": "?",
"!": "!",
":": ":",
"。": ".",
",": ",",
";": ";",
"(": "(",
")": ")",
"【": "[",
"】": "]",
"、": ",",
hanSymbols = map[rune]rune{
'?': '?',
'!': '!',
':': ':',
'。': '.',
',': ',',
';': ';',
'(': '(',
')': ')',
'【': '[',
'】': ']',
'、': ',',
'“': '"',
'”': '"',
}
)

Expand Down Expand Up @@ -66,52 +67,35 @@ func Join(a []string) (s string) {
// including letters, numbers, symbols
func Paragraph(p string, segs ...gse.Segmenter) (s string) {
p = pinyinPhrase(p, segs...)

var b strings.Builder
var last rune
for _, r := range p {
if unicode.Is(unicode.Han, r) {
// Han chars
if last != 0 && !isPunctOrSymbol(last) {
b.WriteRune(' ')
}
result := gpy.HanPinyin(string(r), Option)
if len(result) == 0 {
continue
}

if len(result[0]) == 0 {
continue
}

s += " " + string(result[0][0]) + " "
b.WriteString(result[0][0])
} else if symbol, ok := hanSymbols[r]; ok {
// Han symbols
b.WriteRune(symbol)
} else {
// Not han chars
char := string(r)

if allowReg.MatchString(char) {
s += char
} else {
if hanSymbols[char] != "" {
s += hanSymbols[char]
}
}
// Other
b.WriteRune(r)
}
last = r
}
s = b.String()

// collapse each run of whitespace into a single space
s = spacesReg.ReplaceAllString(s, " ")
m := map[string]string{
" ,": ",",
" .": ".",
" ?": "?",
" !": "!",
" ;": ";",
" :": ":",
" (": "(",
") ": ")",
"[ ": "[",
" ]": "]",
}

for k, v := range m {
s = strings.Replace(s, k, v, -1)
}

s = strings.TrimSpace(s)
return
Expand Down
9 changes: 9 additions & 0 deletions phrase/paragraph_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,15 @@ func TestParagraph(t *testing.T) {
"香港维多利亚港": "xiang gang wei duo li ya gang",
"上海外滩, 陆家嘴上海中心大厦": "shang hai wai tan, lu jia zui shang hai zhong xin da sha",
"北京八达岭长城": "bei jing ba da ling chang cheng",

"你好Golang": "ni hao Golang",
"2006年": "2006 nian",
"价格$100": "jia ge$100",
"邮箱:[email protected]": "you xiang:[email protected]",
"你好!": "ni hao!",
"“你好”": "\"ni hao\"",
"中文#标签": "zhong wen#biao qian",
"阅读《Astérix》": "yue du《Astérix》",
}

seg, err := gse.New("zh, ../examples/dict.txt")
Expand Down
18 changes: 15 additions & 3 deletions phrase/phrase_cut.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ package phrase

import (
"strings"
"unicode"

"github.com/go-ego/gpy"
"github.com/go-ego/gse"
Expand Down Expand Up @@ -62,10 +63,21 @@ func Match(word string) string {
return match
}

// isPunctOrSymbol reports whether r is in the Unicode punctuation (P) or
// symbol (S) categories.
//
// It uses unicode.IsPunct and unicode.IsSymbol instead of unicode.IsOneOf:
// the standard library documents IsOneOf as superseded by unicode.In, and the
// IsOneOf form built a new []*unicode.RangeTable on every call.
func isPunctOrSymbol(r rune) bool {
	return unicode.IsPunct(r) || unicode.IsSymbol(r)
}

func matchs(s, word string) string {
match := Match(word)
if match != "" {
s = strings.Replace(s, word, " "+match+" ", 1)
if match := Match(word); match != "" {
if before, after, found := strings.Cut(s, word); found {
if before := []rune(before); len(before) > 0 && !isPunctOrSymbol(before[len(before)-1]) {
match = " " + match
}
if after := []rune(after); len(after) > 0 && !isPunctOrSymbol(after[0]) {
match += " "
}
s = strings.Replace(s, word, match, 1)
}
}

return s
Expand Down
4 changes: 2 additions & 2 deletions phrase/phrase_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,12 @@ func Test_pinyinPhrase(t *testing.T) {
expects := map[string]string{
"西雅图太空针, The Space Nedle": "西雅图 tai kong 针, The Space Nedle",
"旧金山湾金门大桥": "旧金山湾金门 da qiao",
"纽约帝国大厦, 纽约时代广场, 世贸中心": "纽约帝国 da sha , 纽约时代 guang chang , 世贸中心",
"纽约帝国大厦, 纽约时代广场, 世贸中心": "纽约帝国 da sha, 纽约时代 guang chang, 世贸中心",
"多伦多加拿大国家电视塔, the CN Tower, 尼亚加拉大瀑布": "多伦多加拿 da guo jia dian 视塔, the CN Tower, 尼亚加拉大 pu bu",
"伦敦泰晤士河, 大笨钟": "lun dun 泰晤士河, 大笨钟",

"洛杉矶好莱坞": "洛杉矶 hao lai wu",
"东京都, 东京晴空塔": "东 jing du , 东京 qing kong 塔",
"东京都, 东京晴空塔": "东 jing du, 东京 qing kong 塔",
"巴黎埃菲尔铁塔": "巴黎 ai fei er tie ta",
"香港维多利亚港": "xiang gang 维多利亚港",
"上海外滩, 陆家嘴上海中心大厦": "shang hai 外滩, 陆家嘴 shang hai 中心 da sha",
Expand Down