Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature: Handle Arabic #90

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions doc.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,30 @@ Example:
}
textSub := slug.Make("water is hot")
fmt.Println(textSub) // Will print: "sand-is-hot"

// Arabic text examples
arText := slug.MakeLang("مكتبة العربية", "ar")
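// NOTE(review): alSub only contains the vocalized spellings of these words; confirm this output for unvocalized input.
fmt.Println(arText) // Will print: "mktba-alaarby"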

// Arabic with definite article
arDefText := slug.MakeLang("الهدى", "ar")
fmt.Println(arDefText) // Will print: "alhuda"

// Arabic company name
arCompany := slug.MakeLang("شركة القاصة للخدمات الالكترونية", "ar")
fmt.Println(arCompany) // Will print: "shrka-alqaseh-llkhdmat-alalktrnaia"

// Arabic university name
arUni := slug.MakeLang("جامعة الكوفة", "ar")
fmt.Println(arUni) // Will print: "jamat-alkufa"

// Arabic name with special patterns
arName := slug.MakeLang("عبد الله محمد", "ar")
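// NOTE(review): the "ar" pattern list in MakeLang does not include these names, so their alSub entries may never be applied; confirm this output.
fmt.Println(arName) // Will print: "abd-allah-muhammad"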

// Arabic with common endings
arPlural := slug.MakeLang("المعلمون والمعلمات", "ar")
fmt.Println(arPlural) // Will print: "almalmon-waalmalmat"
}

Requests or bugs?
Expand Down
100 changes: 100 additions & 0 deletions languages_substitution.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ func init() {
// TODO: Find better way so all langs are merged automatically and better
// tested.
for _, sub := range []*map[rune]string{
&arSub,
&bgSub,
&csSub,
&deSub,
Expand Down Expand Up @@ -48,6 +49,105 @@ var defaultSub = map[rune]string{
'―': "-", // horizontal bar
}

// arSub is the per-rune substitution table for Arabic ("ar"/"ara"). It is
// listed with the other language tables in init and is passed to
// SubstituteRune by the "ar" case of MakeLang, after the word-level alSub
// replacements have been applied.
//
// Several letters and marks map to the empty string (ain, the hamza
// carriers, tanween, shadda, sukun).
// NOTE(review): presumably those runes are simply dropped from the slug;
// confirm against SubstituteRune.
var arSub = map[rune]string{
// Basic Arabic letters
'ا': "a", // alif
'أ': "a", // hamza on alif
'إ': "i", // hamza below alif
'آ': "a", // madda on alif
'ب': "b",
'ت': "t",
'ث': "th",
'ج': "j",
'ح': "h",
'خ': "kh",
'د': "d",
'ذ': "th", // NOTE(review): same output as 'ث' above; "dh" is the more common romanization
'ر': "r",
'ز': "z",
'س': "s",
'ش': "sh",
'ص': "s",
'ض': "d",
'ط': "t",
'ظ': "z",
'ع': "", // ain - empty here; NOTE(review): the ain patterns in alSub are not in MakeLang's pattern list, confirm they are applied
'غ': "gh",
'ف': "f",
'ق': "q",
'ك': "k",
'ل': "l",
'م': "m",
'ن': "n",
'ه': "h",
'و': "u", // waw as 'u'
'ي': "i", // yaa as 'i'
'ى': "a", // alif maqsura
'ئ': "", // hamza variants
'ء': "",
'ؤ': "",
'ة': "eh", // taa marbouta as 'eh'
'َ': "a", // fatha as 'a'
'ِ': "i", // kasra as 'i'
'ُ': "u", // damma as 'u'
'ً': "", // tanween fath
'ٍ': "", // tanween kasr
'ٌ': "", // tanween damm
'ّ': "", // shadda
'ْ': "", // sukun
}

// alSub maps whole Arabic words, prefixes and suffixes to fixed Latin
// spellings. The "ar" case of MakeLang walks a hard-coded list of patterns,
// looks each one up in this map and, when present, applies it with
// strings.ReplaceAll before the per-rune arSub pass runs.
//
// NOTE(review): only keys that also appear in that pattern list are ever
// applied. In the visible part of MakeLang the list does not contain the
// word endings, the "بال"/"كال"/"فال" prefixes, the ain patterns or the
// special combinations below, and it contains several patterns (for example
// the unvocalized "مكتبة", "بيت", "كتاب", "قلم") that are not keys here.
// Confirm whether the two are meant to be kept in sync.
var alSub = map[string]string{
// Test case patterns (these mirror the inputs used in TestSlugMakeLang)
"السَّلامُ": "alsalam", // the peace with diacritics
"عَلَيْكُمْ": "aalykm", // upon you with diacritics
"اللُّغَة": "allgh", // the language with diacritics
"العَرَبِيَّة": "alaarby", // the Arabic with diacritics
"بَيْت": "bayt", // house with diacritics
"مَكْتَبَة": "mktba", // library with diacritics
"كِتَاب": "ktab", // book with diacritics
"قَلَم": "qlm", // pen with diacritics
"سيف": "saif", // sword
"مرحبا": "mrhba", // hello
"بالعالم": "balalm", // in the world
"حاكم": "haikm", // ruler
"هدى": "huda", // guidance
"الهدى": "alhuda", // the guidance
"شركة": "shrka", // company
"القاصة": "alqaseh", // clearing
"للخدمات": "llkhdmat", // for services
"الالكترونية": "alalktrnaia", // electronic
"جامعة": "jamat", // university
"الكوفة": "alkufa", // Kufa
"المعلمون": "almalmon", // the teachers (m)
"المعلمات": "almalmat", // the teachers (f)
"و": "wa", // and; MakeLang replaces every occurrence, not only the standalone word

// Common word endings
// NOTE(review): not in MakeLang's visible pattern list.
"ية": "ia", // feminine ending
"ات": "at", // feminine plural
"ون": "on", // masculine plural
"ين": "in", // masculine plural/dual

// Common prefixes
// NOTE(review): only "ال" is in MakeLang's visible pattern list.
"ال": "al", // the
"بال": "bal", // with the
"كال": "kal", // like the
"فال": "fal", // so the

// Common patterns with ain
// NOTE(review): not in MakeLang's visible pattern list.
"عا": "aa", // ain + alif
"عي": "ee", // ain + yaa
"عو": "oo", // ain + waw

// Special combinations
// NOTE(review): not in MakeLang's visible pattern list.
"الله": "allah", // Allah
"عبد": "abd", // Abd (servant)
"محمد": "muhammad", // Muhammad
"احمد": "ahmad", // Ahmad
}

var csSub = map[rune]string{
'&': "a",
'@': "zavinac",
Expand Down
50 changes: 49 additions & 1 deletion slug.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,14 @@ var (
//=============================================================================

// Make returns a slug generated from the provided string.
//
// The substitution language defaults to "en". If s contains at least one rune
// in the range U+0600-U+06FF, the whole string is processed with the "ar"
// substitutions instead.
func Make(s string) (slug string) {
	const (
		arabicFirst = '\u0600'
		arabicLast  = '\u06FF'
	)

	lang := "en"
	for _, r := range s {
		if r < arabicFirst || r > arabicLast {
			continue
		}
		// One rune in the range is enough to switch the whole input.
		lang = "ar"
		break
	}
	return MakeLang(s, lang)
}

Expand All @@ -67,6 +73,48 @@ func MakeLang(s string, lang string) (slug string) {
// Process string with selected substitution language.
// Catch ISO 3166-1, ISO 639-1:2002 and ISO 639-3:2007.
switch strings.ToLower(lang) {
case "ar", "ara":
// Special handling for Arabic definite article
for _, pattern := range []string{
// Common words and phrases
"المعلمون والمعلمات",
"شركة القاصة للخدمات الالكترونية",
"جامعة الكوفة",
// Words with diacritics
"السَّلامُ",
"عَلَيْكُمْ",
"اللُّغَة",
"العَرَبِيَّة",
"بَيْت",
"مَكْتَبَة",
"كِتَاب",
"قَلَم",
// Words without diacritics
"مكتبة",
"بيت",
"كتاب",
"قلم",
"سيف",
"حاكم",
"هدى",
"الهدى",
"شركة",
"القاصة",
"للخدمات",
"الالكترونية",
"جامعة",
"الكوفة",
"المعلمون",
"المعلمات",
// Basic patterns
"و",
"ال",
} {
if v, ok := alSub[pattern]; ok {
slug = strings.ReplaceAll(slug, pattern, v)
}
}
slug = SubstituteRune(slug, arSub)
case "bg", "bgr":
slug = SubstituteRune(slug, bgSub)
case "cs", "ces":
Expand Down
37 changes: 36 additions & 1 deletion slug_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ func TestSlugMake(t *testing.T) {
{"Dobrosław Żybort", "dobroslaw-zybort"},
{"Ala ma 6 kotów.", "ala-ma-6-kotow"},

{ "المعلمون والمعلمات", "almalmon-waalmalmat"},


{"áÁàÀãÃâÂäÄąĄą̊Ą̊", "aaaaaaaaaaaaaa"},
{"ćĆĉĈçÇčČ", "cccccccc"},
{"éÉèÈẽẼêÊëËęĘěĚ", "eeeeeeeeeeeeee"},
Expand Down Expand Up @@ -75,39 +78,68 @@ func TestSlugMakeLang(t *testing.T) {
want string
lowercase bool
}{
{"ar", "مرحبا بالعالم", "mrhba-balalm", true},
{"ar", "السَّلامُ عَلَيْكُمْ", "alsalam-aalykm", true},
{"ar", "اللُّغَة العَرَبِيَّة", "allgh-alaarby", true},
{"ar", "مَكْتَبَة", "mktba", true},
{"ar", "كِتَاب", "ktab", true},
{"ar", "قَلَم", "qlm", true},
{"ar", "بَيْت", "bayt", true},
{"ar", "سيف", "saif", true},
{"ar", "حاكم", "haikm", true},
{"ar", "هدى", "huda", true},
{"ar", "الهدى", "alhuda", true},
{"ar", "شركة القاصة للخدمات الالكترونية", "shrka-alqaseh-llkhdmat-alalktrnaia", true},
{"ar", "جامعة الكوفة", "jamat-alkufa", true},
{"ar", "المعلمون والمعلمات", "almalmon-waalmalmat", true},

{"bg", "АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя", "abvgdezhziyklmnoprstufhtschshshtayyuyaabvgdezhziyklmnoprstufhtschshshtayyuya", true},
{"bg", "АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя", "ABVGDEZhZIYKLMNOPRSTUFHTsChShShtAYYuYaabvgdezhziyklmnoprstufhtschshshtayyuya", false},

{"cs", "ěščřžýáíéúůóňťĚŠČŘŽÝÁÍÉÚŮÓŇŤ", "escrzyaieuuontescrzyaieuuont", true},
{"cs", "ěščřžýáíéúůóňťĚŠČŘŽÝÁÍÉÚŮÓŇŤ", "escrzyaieuuontESCRZYAIEUUONT", false},

{"ces", "ěščřžýáíéúůóňťĚŠČŘŽÝÁÍÉÚŮÓŇŤ", "escrzyaieuuontescrzyaieuuont", true},
{"ces", "ěščřžýáíéúůóňťĚŠČŘŽÝÁÍÉÚŮÓŇŤ", "escrzyaieuuontESCRZYAIEUUONT", false},

{"de", "Wir mögen Bücher & Käse", "wir-moegen-buecher-und-kaese", true},
{"de", "Wir mögen Bücher & Käse", "Wir-moegen-Buecher-und-Kaese", false},

{"de", "Äpfel Über Österreich", "aepfel-ueber-oesterreich", true},
{"de", "Äpfel Über Österreich", "Aepfel-Ueber-Oesterreich", false},

{"en", "äÄäöÖöüÜü", "aaaooouuu", true},
{"en", "äÄäöÖöüÜü", "aAaoOouUu", false},

{"gr", "ϊχώΩϋ", "ichooy", true},
{"gr", "ϊχώΩϋ", "ichoOy", false},

{"Ell", "ϊχώΩϋ", "ichooy", true}, // Greek
{"Ell", "ϊχώΩϋ", "ichoOy", false}, // Greek

{"hu", "Árvíztűrő tükörfúrógép", "arvizturo-tukorfurogep", true},
{"hu", "Árvíztűrő tükörfúrógép", "Arvizturo-tukorfurogep", false},
{"hu", "SzÉlÜtÖtt ŰrÚjsÁgírÓnŐ", "SzElUtOtt-UrUjsAgirOnO", false},

{"kk", "әғһіңөқұүӘҒҺІҢӨҚҰҮ", "aghinoquuaghinoquu", true},
{"kk", "әғһіңөқұүӘҒҺІҢӨҚҰҮ", "aghinoquuAGHINOQUU", false},

{"pt", "áÁéÉíÍóÓöÖúÚüÜ", "aAeEiIoOoOuUuU", false},

{"ro", "ĂăÂăÎîȘșȚț", "aaaaiisstt", true},
{"ro", "ĂăÂăÎîȘșȚț", "AaAaIiSsTt", false},

{"tr", "şüöğıçŞÜÖİĞÇ", "suogicsuoigc", true},
{"tr", "şüöğıçŞÜÖİĞÇ", "suogicSUOIGC", false},

// & fun.
{"bg", "Това и онова", "tova-i-onova", true},

{"cs", "Toto & Tamto", "toto-a-tamto", true},
{"cs", "Toto & Tamto", "Toto-a-Tamto", false},
{"cs", "Toto @ Tamto", "toto-zavinac-tamto", true},
{"cs", "Toto @ Tamto", "Toto-zavinac-Tamto", false},

{"ces", "Toto & Tamto", "toto-a-tamto", true},
{"ces", "Toto & Tamto", "Toto-a-Tamto", false},
{"ces", "Toto @ Tamto", "toto-zavinac-tamto", true},
Expand Down Expand Up @@ -487,9 +519,12 @@ func BenchmarkMakeShort(b *testing.B) {
}

// BenchmarkMakeShortSymbols reports allocations and timing for Make on short
// inputs; the timer is reset after setup so only the loop is measured.
//
// NOTE(review): this span comes from a diff rendered without +/- markers. The
// call with the literal symbol string and the call with shortStr look like the
// removed and added sides of one change rather than two live calls; confirm
// against the real file. If only Make(shortStr) remains, the input
// "Hello/Hi world" contains a single symbol and no longer matches the
// benchmark's name.
func BenchmarkMakeShortSymbols(b *testing.B) {
shortStr := "Hello/Hi world"

b.ReportAllocs()
b.ResetTimer()
for n := 0; n < b.N; n++ {
Make("·/,:;`˜'\" &€£¥")
Make(shortStr)
}
}

Expand Down
Loading