From 37c157a86b2dd969720ec1bbfe21f4642f3790af Mon Sep 17 00:00:00 2001 From: adlifarizi Date: Thu, 4 Jun 2026 14:14:56 +0700 Subject: [PATCH] fix: restrict Japanese Kanji detection and remove ineffective Chinese tonal sandhi logic --- .../theveloper/pixelplay/utils/LyricsUtils.kt | 34 ++++--------------- 1 file changed, 7 insertions(+), 27 deletions(-) diff --git a/app/src/main/java/com/theveloper/pixelplay/utils/LyricsUtils.kt b/app/src/main/java/com/theveloper/pixelplay/utils/LyricsUtils.kt index 23cff07c9..ca7dcc12c 100644 --- a/app/src/main/java/com/theveloper/pixelplay/utils/LyricsUtils.kt +++ b/app/src/main/java/com/theveloper/pixelplay/utils/LyricsUtils.kt @@ -107,12 +107,12 @@ object MultiLangRomanizer { private val KYRGYZ_SPECIFIC_CYRILLIC_LETTERS = setOf("Ң", "ң", "Ө", "ө", "Ү", "ү") private val MACEDONIAN_SPECIFIC_CYRILLIC_LETTERS = setOf("Ѓ", "ѓ", "Ѕ", "ѕ", "Ќ", "ќ") - fun isJapanese(text: String): Boolean { + fun isJapanese(text: String, entireLyricsHasKana: Boolean = false): Boolean { // Detect Hiragana or Katakana if (text.any { it in '\u3040'..'\u309F' || it in '\u30A0'..'\u30FF' }) return true - // Treat Kanji-only lines as Japanese (prioritized over Chinese in the main parsing loop) - return text.any { it in '\u4E00'..'\u9FFF' } + // Treat Kanji-only lines as Japanese only if the entire song has Japanese kana + return entireLyricsHasKana && text.any { it in '\u4E00'..'\u9FFF' } } fun isKorean(text: String) = text.any { it in '\uAC00'..'\uD7A3' } fun isHindi(text: String) = text.any { it in '\u0900'..'\u097F' } @@ -382,28 +382,6 @@ object MultiLangRomanizer { continue } - // ── Tonal sandhi: 一 (yī) ────────────────────────────────── - // Before 4th-tone syllable → yí; before 1st/2nd/3rd → yì - c == '一' -> { - val nextPinyin = next?.let { getPinyin(it, format) } ?: "" - // Pinyin4j without-tone loses tone info; we approximate: - // characters that are typically 4th tone starters - val fourthToneInitials = setOf("是","不","去","做","看","用","要","过","太","对","大","会","在","上","下","个","万","次","号") - val tone = if (next != null && fourthToneInitials.contains(next.toString())) "yi2" else "yi4" - sb.append(tone.dropLast(1)).append(" ") - idx++ - continue - } - - // ── Tonal sandhi: 不 (bù) → bú before 4th-tone syllable ── - c == '不' -> { - val fourthToneChars = setOf("是","去","做","看","用","要","过","太","对","大","会","在","上","下","个","万","次","号","不") - val isBefore4th = next != null && fourthToneChars.contains(next.toString()) - sb.append(if (isBefore4th) "bu2" else "bu4").append(" ") - idx++ - continue - } - // ── Regular Hanzi ───────────────────────────────────────── c.toString().matches(Regex("[\u4E00-\u9FA5]")) -> { sb.append(getPinyin(c, format, prevChar = chars.getOrNull(idx - 1))).append(" ") @@ -752,6 +730,8 @@ object LyricsUtils { return Lyrics(plain = emptyList(), synced = emptyList()) } + val entireLyricsHasKana = lyricsText.any { it in '\u3040'..'\u309F' || it in '\u30A0'..'\u30FF' } + val normalizedInput = stripLeadingLyricsDocumentNoise(lyricsText) if (looksLikeTtmlDocument(normalizedInput)) { val converted = TtmlLyricsParser.parseToEnhancedLrc(normalizedInput) @@ -888,7 +868,7 @@ object LyricsUtils { val pairedLines = pairTranslationLines(sortedSyncedLines).map { line -> val romanized = when { - MultiLangRomanizer.isJapanese(line.line) -> MultiLangRomanizer.romanizeJapanese(line.line) + MultiLangRomanizer.isJapanese(line.line, entireLyricsHasKana) -> MultiLangRomanizer.romanizeJapanese(line.line) MultiLangRomanizer.isChinese(line.line) -> MultiLangRomanizer.romanizeChinese(line.line) MultiLangRomanizer.isKorean(line.line) -> MultiLangRomanizer.romanizeKorean(line.line) MultiLangRomanizer.isHindi(line.line) -> MultiLangRomanizer.romanizeHindi(line.line) @@ -910,7 +890,7 @@ object LyricsUtils { } else { val processedPlain = plainLines.map { line -> val romanized = when { - MultiLangRomanizer.isJapanese(line) -> MultiLangRomanizer.romanizeJapanese(line) + MultiLangRomanizer.isJapanese(line, entireLyricsHasKana) -> MultiLangRomanizer.romanizeJapanese(line) MultiLangRomanizer.isChinese(line) -> MultiLangRomanizer.romanizeChinese(line) MultiLangRomanizer.isKorean(line) -> MultiLangRomanizer.romanizeKorean(line) MultiLangRomanizer.isHindi(line) -> MultiLangRomanizer.romanizeHindi(line)