From b041156b85e980978caa9c1fd5b2256523b31357 Mon Sep 17 00:00:00 2001 From: rauschen Date: Tue, 31 Jan 2023 16:56:36 +0100 Subject: [PATCH 01/13] [DE-DE] *.word files Manually translated with love --- numbers/src/main/resources/config/de-de/day.word | 1 + numbers/src/main/resources/config/de-de/days.word | 1 + numbers/src/main/resources/config/de-de/hour.word | 1 + numbers/src/main/resources/config/de-de/hours.word | 1 + numbers/src/main/resources/config/de-de/minute.word | 1 + numbers/src/main/resources/config/de-de/minutes.word | 1 + numbers/src/main/resources/config/de-de/second.word | 1 + numbers/src/main/resources/config/de-de/seconds.word | 1 + 8 files changed, 8 insertions(+) create mode 100644 numbers/src/main/resources/config/de-de/day.word create mode 100644 numbers/src/main/resources/config/de-de/days.word create mode 100644 numbers/src/main/resources/config/de-de/hour.word create mode 100644 numbers/src/main/resources/config/de-de/hours.word create mode 100644 numbers/src/main/resources/config/de-de/minute.word create mode 100644 numbers/src/main/resources/config/de-de/minutes.word create mode 100644 numbers/src/main/resources/config/de-de/second.word create mode 100644 numbers/src/main/resources/config/de-de/seconds.word diff --git a/numbers/src/main/resources/config/de-de/day.word b/numbers/src/main/resources/config/de-de/day.word new file mode 100644 index 00000000..1aa7c259 --- /dev/null +++ b/numbers/src/main/resources/config/de-de/day.word @@ -0,0 +1 @@ +Tag diff --git a/numbers/src/main/resources/config/de-de/days.word b/numbers/src/main/resources/config/de-de/days.word new file mode 100644 index 00000000..4c2fd37c --- /dev/null +++ b/numbers/src/main/resources/config/de-de/days.word @@ -0,0 +1 @@ +Tage diff --git a/numbers/src/main/resources/config/de-de/hour.word b/numbers/src/main/resources/config/de-de/hour.word new file mode 100644 index 00000000..7e69c570 --- /dev/null +++ b/numbers/src/main/resources/config/de-de/hour.word @@ -0,0 +1 @@ 
+Stunde diff --git a/numbers/src/main/resources/config/de-de/hours.word b/numbers/src/main/resources/config/de-de/hours.word new file mode 100644 index 00000000..3c728ba8 --- /dev/null +++ b/numbers/src/main/resources/config/de-de/hours.word @@ -0,0 +1 @@ +Stunden diff --git a/numbers/src/main/resources/config/de-de/minute.word b/numbers/src/main/resources/config/de-de/minute.word new file mode 100644 index 00000000..de476669 --- /dev/null +++ b/numbers/src/main/resources/config/de-de/minute.word @@ -0,0 +1 @@ +Minute diff --git a/numbers/src/main/resources/config/de-de/minutes.word b/numbers/src/main/resources/config/de-de/minutes.word new file mode 100644 index 00000000..bdc262ec --- /dev/null +++ b/numbers/src/main/resources/config/de-de/minutes.word @@ -0,0 +1 @@ +Minuten diff --git a/numbers/src/main/resources/config/de-de/second.word b/numbers/src/main/resources/config/de-de/second.word new file mode 100644 index 00000000..e658c219 --- /dev/null +++ b/numbers/src/main/resources/config/de-de/second.word @@ -0,0 +1 @@ +Sekunde diff --git a/numbers/src/main/resources/config/de-de/seconds.word b/numbers/src/main/resources/config/de-de/seconds.word new file mode 100644 index 00000000..2c54f29c --- /dev/null +++ b/numbers/src/main/resources/config/de-de/seconds.word @@ -0,0 +1 @@ +Sekunden From 126ed35504767cefbdb139ebb921cc4862264ef6 Mon Sep 17 00:00:00 2001 From: rauschen Date: Tue, 31 Jan 2023 16:57:41 +0100 Subject: [PATCH 02/13] [DE-DE] date_time.json copied from lingua franca --- .../resources/config/de-de/date_time.json | 136 ++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 numbers/src/main/resources/config/de-de/date_time.json diff --git a/numbers/src/main/resources/config/de-de/date_time.json b/numbers/src/main/resources/config/de-de/date_time.json new file mode 100644 index 00000000..8a5aace2 --- /dev/null +++ b/numbers/src/main/resources/config/de-de/date_time.json @@ -0,0 +1,136 @@ +{ + "decade_format": { + "1": {"match": "^\\d$", 
"format": "{x}"}, + "2": {"match": "^1\\d$", "format": "{xx}"}, + "3": {"match": "^\\d0$", "format": "{x0}"}, + "4": {"match": "^[2-9]\\d$", "format": "{x} und {x0}"}, + "default": "{number}" + }, + "hundreds_format": { + "1": {"match": "^1\\d{2}$", "format": "hundert"}, + "2": {"match": "^\\d{3}$", "format": "{x_in_x00} hundert"}, + "default": "{number}" + }, + "thousand_format": { + "1": {"match": "^10\\d\\d$", "format": "tausend"}, + "2": {"match": "^\\d0\\d{2}$", "format": "{x_in_x000} tausend"}, + "3": {"match": "^1\\d00$", "format": "{xx_in_xx00} hundert"}, + "4": {"match": "^\\d{2}00$", "format": "{x_in_x000} tausend {x_in_x00} hundert"}, + "5": {"match": "^\\d0\\d\\d$", "format": "{x_in_x000} tausend"}, + "6": {"match": "^1\\d{3}$", "format": "{xx_in_xx00}"}, + "7": {"match": "^\\d{4}$", "format": "{x_in_x000} tausend {x_in_x00} hundert"}, + "default": "{number}" + }, + "year_format": { + "1": {"match": "^1$", "format": "eins {bc}"}, + "2": {"match": "^\\d{1}?$", "format": "{formatted_decade} {bc}"}, + "3": {"match": "^\\d{2}?$", "format": "{formatted_decade} {bc}"}, + "4": {"match": "^\\d00$", "format": "{formatted_hundreds} {bc}"}, + "5": {"match": "^\\d{3}$", "format": "{formatted_hundreds} {formatted_decade} {bc}"}, + "6": {"match": "^\\d{2}00$", "format": "{formatted_thousand} {bc}"}, + "7": {"match": "^\\d00\\d$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, + "8": {"match": "^\\d{2}0\\d$", "format": "{formatted_thousand} hundert {formatted_decade} {bc}"}, + "9": {"match": "^1[2-9]\\d{2}$", "format": "{formatted_thousand} hundert {formatted_decade} {bc}"}, + "10": {"match": "^1\\d{3}$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, + "11": {"match": "^\\d{4}$", "format": "{formatted_thousand} {formatted_decade} {bc}"}, + "default": "{year} {bc}", + "bc": "v.d.Z." 
+ }, + "date_format": { + "date_full": "{weekday}, {day} {month}, {formatted_year}", + "date_full_no_year": "{weekday}, {day} {month}", + "date_full_no_year_month": "{weekday}, {day}", + "today": "heute", + "tomorrow": "morgen", + "yesterday": "gestern" + }, + "date_time_format": { + "date_time": "{formatted_date} um {formatted_time}" + }, + "weekday": { + "0": "Montag", + "1": "Dienstag", + "2": "Mittwoch", + "3": "Donnerstag", + "4": "Freitag", + "5": "Samstag", + "6": "Sonntag" + }, + "date": { + "1": "erster", + "2": "zweiter", + "3": "dritter", + "4": "vierter", + "5": "fünfter", + "6": "sechster", + "7": "siebter", + "8": "achter", + "9": "neunter", + "10": "zehnter", + "11": "elfter", + "12": "zwölfter", + "13": "dreizehnter", + "14": "vierzehnter", + "15": "fünfzehnter", + "16": "sechzehnter", + "17": "siebzehnter", + "18": "achtzehnter", + "19": "neunzehnter", + "20": "zwanzigster", + "21": "einundzwanzigster", + "22": "zweiundzwanzigster", + "23": "dreiundzwanzigster", + "24": "vierundzwanzigster", + "25": "fünfundzwanzigster", + "26": "sechsundzwanzigster", + "27": "siebenundzwanzigster", + "28": "achtundzwanzigster", + "29": "neunundzwanzigster", + "30": "dreißigster", + "31": "einunddreißigster" + }, + "month": { + "1": "Januar", + "2": "Februar", + "3": "März", + "4": "April", + "5": "Mai", + "6": "Juni", + "7": "Juli", + "8": "August", + "9": "September", + "10": "Oktober", + "11": "November", + "12": "Dezember" + }, + "number": { + "0": "null", + "1": "ein", + "2": "zwei", + "3": "drei", + "4": "vier", + "5": "fünf", + "6": "sechs", + "7": "sieben", + "8": "acht", + "9": "neun", + "10": "zehn", + "11": "elf", + "12": "zwölf", + "13": "dreizehn", + "14": "vierzehn", + "15": "fünfzehn", + "16": "sechzehn", + "17": "siebzehn", + "18": "achtzehn", + "19": "neunzehn", + "20": "zwanzig", + "30": "dreißig", + "40": "vierzig", + "50": "fünfzig", + "60": "sechzig", + "70": "siebzig", + "80": "achtzig", + "90": "neunzig" + } +} From 
16315e357859a0b52a82be54a42b8acaf286ed97 Mon Sep 17 00:00:00 2001 From: rauschen Date: Tue, 31 Jan 2023 17:42:30 +0100 Subject: [PATCH 03/13] [DE-DE] tokenizer first draft, needs more work --- .../resources/config/de-de/tokenizer.json | 437 ++++++++++++++++++ 1 file changed, 437 insertions(+) create mode 100644 numbers/src/main/resources/config/de-de/tokenizer.json diff --git a/numbers/src/main/resources/config/de-de/tokenizer.json b/numbers/src/main/resources/config/de-de/tokenizer.json new file mode 100644 index 00000000..9277e9de --- /dev/null +++ b/numbers/src/main/resources/config/de-de/tokenizer.json @@ -0,0 +1,437 @@ +{ + "spaces": " \t\n\f\r:;_!?<>|=()[]{}»«*~^`'\"", + "characters_as_word": "%‰#-+.,/", + "raw_number_categories": [ + "number", + "raw" + ], + "plural_endings": [ + "s" + ], + "word_matches": [ + { + "categories": [ + "ignore" + ], + "values": [ + "a", + "an", + "and" + ] + }, + { + "categories": [ + "ignore", + "thousand_separator" + ], + "values": [ + "," + ] + }, + { + "categories": [ + "ordinal_suffix" + ], + "values": [ + "." + ] + }, + { + "categories": [ + "point" + ], + "values": [ + "punkt", + "." 
+ ] + }, + { + "categories": [ + "fraction_separator" + ], + "values": [ + "over", + "geteilt", + "/" + ] + }, + { + "categories": [ + "fraction_separator_secondary" + ], + "values": [ + "durch" + ] + }, + { + "categories": [ + "sign", + "positive" + ], + "values": [ + "plus", + "+" + ] + }, + { + "categories": [ + "sign", + "negative" + ], + "values": [ + "minus" + ] + }, + { + "categories": [ + "sign", + "negative", + "ignore" + ], + "values": [ + "-" + ] + }, + { + "categories": [ + "duration_separator" + ], + "values": [ + "von" + ] + } + ], + "number_mappings": [ + { + "categories": [ + "number", + "digit", + "digit_after_point" + ], + "values": { + "null": 0, + "eins": 1, + "zwei": 2, + "drei": 3, + "vier": 4, + "fünf": 5, + "sechs": 6, + "sieben": 7, + "acht": 8, + "neun": 9 + } + }, + { + "categories": [ + "number", + "digit_after_point" + ], + "values": { + "null": 0 + } + }, + { + "categories": [ + "number", + "teen" + ], + "values": { + "zehn": 10, + "elf": 11, + "zwölf": 12, + "dreizehn": 13, + "vierzehn": 14, + "fünfzehn": 15, + "sechzehn": 16, + "siebzehn": 17, + "achtzehn": 18, + "neunzehn": 19 + } + }, + { + "categories": [ + "number", + "tens" + ], + "values": { + "zwanzig": 20, + "dreißig": 30, + "vierzig": 40, + "fünfzig": 50, + "sechzig": 60, + "siebzig": 70, + "achtzig": 80, + "neunzig": 90 + } + }, + { + "categories": [ + "number", + "hundred" + ], + "values": { + "hundert": 100 + } + }, + { + "categories": [ + "number", + "multiplier" + ], + "values": { + "tausend": 1000, + "million": 1000000, + "milliarde": 1000000000, + "billion": 1000000000000, + "billiarde": 1000000000000000, + "trillion": 1000000000000000000 + } + }, + { + "categories": [ + "number", + "ordinal", + "digit" + ], + "values": { + "erster": 1, + "erste": 1, + "erstes": 1, + "zweiter": 2, + "zweite": 2, + "zweites": 2, + "dritter": 3, + "dritte": 3, + "drittes": 3, + "vierter": 4, + "vierte": 4, + "viertes": 4, + "fünfter": 5, + "fünfte": 5, + "fünftes": 5, + "sechster": 6, + 
"sechste": 6, + "sechstes": 6, + "siebter": 7, + "siebte": 7, + "siebtes": 7, + "achter": 8, + "achte": 8, + "achtes": 8, + "neunter": 9 + "neunte": 9 + "neuntes": 9 + } + }, + { + "categories": [ + "number", + "ordinal", + "teen" + ], + "values": { + "zehnter": 10, + "zehnte": 10, + "zehntes": 10, + "elfter": 11, + "elfte": 11, + "elftes": 11, + "zwölfter": 12, + "zwölfte": 12, + "zwölftes": 12, + "dreizehnter": 13, + "dreizehnte": 13, + "dreizehntes": 13, + "vierzehnter": 14, + "vierzehnte": 14, + "vierzehntes": 14, + "fünfzehnter": 15, + "fünfzehnte": 15, + "fünfzehntes": 15, + "sechzehnter": 16, + "sechzehnte": 16, + "sechzehntes": 16, + "siebzehnter": 17, + "siebzehnte": 17, + "siebzehntes": 17, + "achtzehnter": 18, + "achtzehnte": 18, + "achtzehntes": 18, + "neunzehnter": 19, + "neunzehnte": 19, + "neunzehntes": 19 + } + }, + { + "categories": [ + "number", + "ordinal", + "tens" + ], + "values": { + "zwanzigster": 20, + "zwanzigste": 20, + "zwanzigstes": 20, + "dreißigster": 30, + "dreißigste": 30, + "dreißigstes": 30, + "vierzigster": 40, + "vierzigste": 40, + "vierzigstes": 40, + "fünfzigster": 50, + "fünfzigste": 50, + "fünfzigstes": 50, + "sechzigster": 60, + "sechzigste": 60, + "sechzigstes": 60, + "siebzigster": 70, + "siebzigste": 70, + "siebzigstes": 70, + "achtzigster": 80, + "achtzigste": 80, + "achtzigstes": 80, + "neunzigster": 90, + "neunzigste": 90, + "neunzigstes": 90 + } + }, + { + "categories": [ + "number", + "ordinal", + "hundred" + ], + "values": { + "hundertster": 100, + "hundertste": 100, + "hundertstes": 100 + } + }, + { + "categories": [ + "number", + "ordinal", + "multiplier" + ], + "values": { + "tausendster": 1000, + "tausendste": 1000, + "tausendstes": 1000, + "millionster": 1000000, + "millionste": 1000000, + "millionstes": 1000000, + "milliardster": 1000000000, + "milliardste": 1000000000, + "milliardstes": 1000000000, + "billionster": 1000000000000, + "billionste": 1000000000000, + "billionster": 1000000000000, + "billiardster": 
1000000000000000, + "billiardste": 1000000000000000, + "billiardstes": 1000000000000000, + "trillionster": 1000000000000000000, + "trillionste": 1000000000000000000, + "trillionstes": 1000000000000000000 + } + }, + { + "categories": [ + "number", + "suffix_multiplier" + ], + "values": { + "halb": 0.5, + "viertel": 0.25, + "paar": 2, + "dutzend": 12, + "prozent": 0.01, + "%": 0.01, + "promille": 0.001, + "promill": 0.001, + "‰": 0.001 + } + } + ], + "duration_words": { + "1 NANOS": [ + "Nanosekunde", + "Nanosekunden", + "ns" + ], + "1 MICROS": [ + "Mikrosekunde", + "Mikrosekunden", + "μs" + ], + "1 MILLIS": [ + "Millisekunde", + "Millisekunden", + "ms" + ], + "1 SECONDS": [ + "Sekunde", + "Sekunden", + "s", + ], + "1 MINUTES": [ + "Minute", + "Minuten", + "m", + "Min" + ], + "1 HOURS": [ + "Stunde", + "Stunden", + "h" + ], + "1 DAYS": [ + "Tag", + "Tage" + "T" + ], + "1 WEEKS": [ + "Woche", + "Wochen", + "W" + ], + "1 MONTHS": [ + "Monat", + "Monate", + "Mon" + ], + "1 YEARS": [ + "Jahr", + "Jahre", + "J" + ], + "1 DECADES": [ + "Jahrzehnt", + "Jahrzehnte", + "Decade", + "Decaden" + ], + "1 CENTURIES": [ + "Jahrhundert", + "Jahrhunderte" + ], + "1 MILLENNIA": [ + "Millenium", + "Millenien", + "Jahrtausend", + "Jahrtausende" + "Millenia" + ] + }, + "duration_restrict_after_number": [ + "ns", + "μs", + "ms", + "s", + "m", + "h", + "d", + "w", + "mo", + "yr" + ] +} From 558bc1bbd84276711535942ba369871e84daf7ec Mon Sep 17 00:00:00 2001 From: rauschen Date: Tue, 31 Jan 2023 17:49:45 +0100 Subject: [PATCH 04/13] [DE-DE] date_time_test copied from lingua-franca https://github.com/MycroftAI/lingua-franca/tree/master/lingua_franca/res/text/de-de --- .../config/de-de/date_time_test.json | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 numbers/src/test/resources/config/de-de/date_time_test.json diff --git a/numbers/src/test/resources/config/de-de/date_time_test.json b/numbers/src/test/resources/config/de-de/date_time_test.json new file mode 100644 
index 00000000..1bea06f6 --- /dev/null +++ b/numbers/src/test/resources/config/de-de/date_time_test.json @@ -0,0 +1,43 @@ +{ + "test_nice_year": { + "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "eins v.d.Z." }, + "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "zehn v.d.Z." }, + "3": {"datetime_param": "92, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "zwei und neunzig v.d.Z." }, + "4": {"datetime_param": "803, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "acht hundert drei" }, + "5": {"datetime_param": "811, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "acht hundert elf" }, + "6": {"datetime_param": "454, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "vier hundert vier und fünfzig" }, + "7": {"datetime_param": "1005, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "tausend fünf" }, + "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "tausend zwölf" }, + "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "tausend sechs und vierzig" }, + "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "achtzehn hundert sieben" }, + "11": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "siebzehn hundert siebzehn" }, + "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "neunzehn hundert acht und achtzig"}, + "13": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "zwei tausend neun"}, + "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "zwei tausend achtzehn"}, + "15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "zwei tausend ein und zwanzig"}, + "16": {"datetime_param": "2030, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "zwei tausend dreißig"}, + "17": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "zwei tausend ein hundert" }, + "18": {"datetime_param": 
"1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "tausend" }, + "19": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "zwei tausend" }, + "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "drei tausend ein hundert zwanzig v.d.Z." }, + "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "drei tausend zwei hundert ein und vierzig v.d.Z." }, + "22": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "fünf tausend zwei hundert" }, + "23": {"datetime_param": "1100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "elf hundert" }, + "24": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "zwei tausend ein hundert" } + }, + "test_nice_date": { + "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "Dienstag, einunddreißigster Januar, zwei tausend siebzehn"}, + "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "Sonntag, vierter Februar, zwei tausend achtzehn"}, + "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "Sonntag, vierter Februar"}, + "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "Sonntag, vierter"}, + "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "morgen"}, + "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "heute"}, + "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "gestern"}, + "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "Sonntag, vierter Februar"}, + "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "Sonntag, vierter Februar, zwei tausend achtzehn"} + }, + "test_nice_date_time": { + "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", 
"use_24hour": "False", "use_ampm": "True", "assertEqual": "Dienstag, einunddreißigster Januar, zwei tausend siebzehn um ein Uhr zweiundzwanzig nachmittags"}, + "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "Dienstag, einunddreißigster Januar, zwei tausend siebzehn um dreizehn Uhr zweiundzwanzig"} + } +} From 3edb0b665b66b92f629e771307a5455a3609faf3 Mon Sep 17 00:00:00 2001 From: rauschen Date: Tue, 31 Jan 2023 18:59:23 +0100 Subject: [PATCH 05/13] [DE-DE] Some adjustments to tokenizer --- .../resources/config/de-de/tokenizer.json | 92 ++++++++++++------- 1 file changed, 58 insertions(+), 34 deletions(-) diff --git a/numbers/src/main/resources/config/de-de/tokenizer.json b/numbers/src/main/resources/config/de-de/tokenizer.json index 9277e9de..e475322e 100644 --- a/numbers/src/main/resources/config/de-de/tokenizer.json +++ b/numbers/src/main/resources/config/de-de/tokenizer.json @@ -6,7 +6,7 @@ "raw" ], "plural_endings": [ - "s" + "er" ], "word_matches": [ { @@ -14,9 +14,7 @@ "ignore" ], "values": [ - "a", - "an", - "and" + "und" ] }, { @@ -25,7 +23,7 @@ "thousand_separator" ], "values": [ - "," + "." ] }, { @@ -33,6 +31,9 @@ "ordinal_suffix" ], "values": [ + "ter", + "te", + "tes", "." ] }, @@ -42,6 +43,18 @@ ], "values": [ "punkt", + "komma", + ".", + "," + ] + }, + { + "categories": [ + "point", + "ignore" + ], + "values": [ + ",", "." ] }, @@ -50,7 +63,6 @@ "fraction_separator" ], "values": [ - "over", "geteilt", "/" ] @@ -79,16 +91,7 @@ "negative" ], "values": [ - "minus" - ] - }, - { - "categories": [ - "sign", - "negative", - "ignore" - ], - "values": [ + "minus", "-" ] }, @@ -341,6 +344,7 @@ ], "values": { "halb": 0.5, + "halbe": 0.5, "viertel": 0.25, "paar": 2, "dutzend": 12, @@ -372,54 +376,74 @@ "Sekunde", "Sekunden", "s", + "Sek.", + "sec" ], "1 MINUTES": [ "Minute", "Minuten", - "m", - "Min" + "min", + "Min.", + "m" ], "1 HOURS": [ "Stunde", "Stunden", - "h" + "h", + "Std." 
], "1 DAYS": [ "Tag", "Tage" - "T" + "T.", + "d", + "Tg." ], "1 WEEKS": [ "Woche", "Wochen", - "W" + "W.", + "Wo." ], "1 MONTHS": [ "Monat", "Monate", - "Mon" + "Mon.", + "M.", + "Mt.", + "Mo." ], "1 YEARS": [ "Jahr", "Jahre", - "J" + "J.", + "Jr.", + "y", + "a" ], "1 DECADES": [ "Jahrzehnt", "Jahrzehnte", - "Decade", - "Decaden" + "Dekade", + "Dekaden" ], "1 CENTURIES": [ "Jahrhundert", - "Jahrhunderte" + "Jahrhunderte", + "Jh.", + "Jhdt.", + "Jahrh." ], "1 MILLENNIA": [ - "Millenium", - "Millenien", "Jahrtausend", - "Jahrtausende" - "Millenia" + "Jahrtausende", + "Millennium", + "Millennien", + "Jt.", + "Jtsd.", + "Jahrt.", + "Jhtsd.", + "Jhtsde." ] }, "duration_restrict_after_number": [ @@ -429,9 +453,9 @@ "s", "m", "h", - "d", - "w", - "mo", - "yr" + "T", + "W", + "M", + "J" ] } From b65194a4c594ab35403b38a5de3ff94d0980d701 Mon Sep 17 00:00:00 2001 From: rauschen Date: Mon, 6 Feb 2023 19:04:15 +0100 Subject: [PATCH 06/13] [de-de] GermanParser.java first version --- .../dicio/numbers/lang/de/GermanParser.java | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 numbers/src/main/java/org/dicio/numbers/lang/de/GermanParser.java diff --git a/numbers/src/main/java/org/dicio/numbers/lang/de/GermanParser.java b/numbers/src/main/java/org/dicio/numbers/lang/de/GermanParser.java new file mode 100644 index 00000000..0573b1a7 --- /dev/null +++ b/numbers/src/main/java/org/dicio/numbers/lang/de/GermanParser.java @@ -0,0 +1,42 @@ +package org.dicio.numbers.lang.de; + +import org.dicio.numbers.parser.NumberParser; +import org.dicio.numbers.parser.lexer.TokenStream; +import org.dicio.numbers.util.DurationExtractorUtils; + +import java.time.Duration; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.util.List; + +public class GermanParser extends NumberParser { + + public GermanParser() { + super("config/de-de"); + } + + + @Override + public List extractNumbers(final String utterance, + final boolean shortScale, + final boolean 
preferOrdinal) { + return new GermanNumberExtractor(new TokenStream(tokenizer.tokenize(utterance)), + shortScale, preferOrdinal).extractNumbers(); + } + + @Override + public Duration extractDuration(final String utterance, final boolean shortScale) { + final TokenStream tokenStream = new TokenStream(tokenizer.tokenize(utterance)); + final GermanNumberExtractor numberExtractor + = new GermanNumberExtractor(tokenStream, shortScale, false); + return DurationExtractorUtils.extractDuration(tokenStream, + numberExtractor::extractOneNumberNoOrdinal); + } + + @Override + public LocalDateTime extractDateTime(final String utterance, + final boolean anchorDate, + final LocalTime defaultTime) { + return null; + } +} From dbbdb87e7b6d54045dfe7631c4b08fea5e208846 Mon Sep 17 00:00:00 2001 From: rauschen Date: Mon, 6 Feb 2023 21:15:31 +0100 Subject: [PATCH 07/13] [de-de] GermanFormater.java part 1 --- .../numbers/lang/de/GermanFormatter.java | 455 ++++++++++++++++++ 1 file changed, 455 insertions(+) create mode 100644 numbers/src/main/java/org/dicio/numbers/lang/de/GermanFormatter.java diff --git a/numbers/src/main/java/org/dicio/numbers/lang/de/GermanFormatter.java b/numbers/src/main/java/org/dicio/numbers/lang/de/GermanFormatter.java new file mode 100644 index 00000000..5e905929 --- /dev/null +++ b/numbers/src/main/java/org/dicio/numbers/lang/de/GermanFormatter.java @@ -0,0 +1,455 @@ +package org.dicio.numbers.lang.de; + +import org.dicio.numbers.formatter.NumberFormatter; +import org.dicio.numbers.util.MixedFraction; +import org.dicio.numbers.util.Utils; + +import java.time.LocalTime; +import java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; + +public class GermanFormatter extends NumberFormatter { + + final Map NUMBER_NAMES = new HashMap() {{ + put(0L, "null"); + put(1L, "eins"); + put(2L, "zwei"); + put(3L, "drei"); + put(4L, "vier"); + put(5L, "fünf"); + put(6L, 
"sechs"); + put(7L, "sieben"); + put(8L, "acht"); + put(9L, "neun"); + put(10L, "zehn"); + put(11L, "elf"); + put(12L, "zwölf"); + put(13L, "dreizehn"); + put(14L, "vierzehn"); + put(15L, "fünfzehn"); + put(16L, "sechzehn"); + put(17L, "siebzehn"); + put(18L, "achtzehn"); + put(19L, "neunzehn"); + put(20L, "zwanzig"); + put(30L, "dreißig"); + put(40L, "vierzig"); + put(50L, "fünfzig"); + put(60L, "sechzig"); + put(70L, "siebzig"); + put(80L, "achtzig"); + put(90L, "neunzig"); + put(100L, "hundert"); + put(1000L, "tausend"); + put(1000000L, "million"); + put(1000000000L, "milliarde"); + put(1000000000000L, "billion"); + put(1000000000000000L, "billiarde"); + put(1000000000000000000L, "trillion"); + }}; + + final Map ORDINAL_NAMES = new HashMap() {{ + put(1L, "erste"); + put(2L, "zweite"); + put(3L, "dritte"); + put(4L, "vierte"); + put(5L, "fünfte"); + put(6L, "sechste"); + put(7L, "siebte"); + put(8L, "achte"); + put(9L, "neunte"); + put(10L, "zehnte"); + put(11L, "elfte"); + put(12L, "zwölfte"); + put(13L, "dreizehnte"); + put(14L, "vierzehnte"); + put(15L, "fünfzehnte"); + put(16L, "sechzehnte"); + put(17L, "siebzehnte"); + put(18L, "achtzehnte"); + put(19L, "neunzehnte"); + put(20L, "zwanzigste"); + put(30L, "dreißigste"); + put(40L, "vierzigste"); + put(50L, "fünfzigste"); + put(60L, "sechzigste"); + put(70L, "siebzigste"); + put(80L, "achtzigste"); + put(90L, "neunzigste"); + put(100L, "hundertste"); + put(1000L, "tausendste"); + put(1000000L, "millionste"); + put(1000000000L, "milliardste"); + put(1000000000000L, "billionste"); + put(1000000000000000L, "billiardste"); + put(1000000000000000000L, "trilliardste"); + }}; + + public GermanFormatter() { + super("config/de-de"); + } + + + + /** + * Format a number to a pronounceable representation. For example, -4000619 would be formatted + * into "minus four million, six hundred and nineteen" for English. 
+ * + * @param number the number to pronounce + * @param places the number of decimal places to round decimal numbers to + * @param shortScale use short (true) or long (false) scale for large numbers (see + * + * Names of large numbers) + * @param scientific if true convert and pronounce in scientific notation + * @param ordinal if true pronounce in the ordinal form (e.g. "first" instead of "one" for + * English) + * @return the formatted number as a string + * + public abstract String pronounceNumber(double number, + int places, + boolean shortScale, + boolean scientific, + boolean ordinal); + +*/ + + @Override + public String niceNumber(final MixedFraction mixedFraction, final boolean speech) { + if (speech) { + final String sign = mixedFraction.negative ? "minus " : ""; + if (mixedFraction.numerator == 0) { + return sign + pronounceNumber(mixedFraction.whole, 0, true, false, false); + } + + String denominatorString; + if (mixedFraction.denominator == 1) { + denominatorString = "Eintel"; + } else if (mixedFraction.denominator == 2) { + denominatorString = "Halbe"; + } else if (mixedFraction.denominator == 3) { + denominatorString = "Drittel"; + } else if (mixedFraction.denominator == 7) { + denominatorString = "Siebtel"; + } else if (mixedFraction.denominator << 20) { + // below 20 use number name + suffix "tel" + denominatorString + = pronounceNumber(mixedFraction.denominator, 0, true, false, true) + "tel"; + } else { + // for 20+ use number name + suffix "stel" + denominatorString + = pronounceNumber(mixedFraction.denominator, 0, true, false, true) + "stel"; + } + + final String numeratorString; + numeratorString = pronounceNumber(mixedFraction.numerator, 0, true, false, false); + + if (mixedFraction.whole == 0) { + return sign + numeratorString + " " + denominatorString; + } else { + return sign + pronounceNumber(mixedFraction.whole, 0, true, false, false) + + " und " + numeratorString + " " + denominatorString; + } + + } else { + return 
niceNumberNotSpeech(mixedFraction); + } + } + + @Override + public String pronounceNumber(double number, + final int places, + final boolean shortScale, + final boolean scientific, + final boolean ordinal) { + + if (number == Double.POSITIVE_INFINITY) { + return "unendlich"; + } else if (number == Double.NEGATIVE_INFINITY) { + return "minus unendlich"; + } else if (Double.isNaN(number)) { + return "keine Zahl"; + } + + // also using scientific mode if the number is too big to be spoken fully. Checking against + // the biggest double smaller than 10^21 = 1000 * 10^18, which is the biggest pronounceable + // number, since e.g. 999.99 * 10^18 can be pronounced correctly. + if (scientific || Math.abs(number) > 999999999999999934463d) { + final String scientificFormatted = String.format(Locale.ENGLISH, "%E", number); + final String[] parts = scientificFormatted.split("E", 2); + final double power = Integer.parseInt(parts[1]); + + if (power != 0) { + // This handles negatives of powers separately from the normal + // handling since each call disables the scientific flag + final double n = Double.parseDouble(parts[0]); + return String.format("%s mal zehn hoch %s", + pronounceNumber(Math.abs(n), places, shortScale, false, false), + pronounceNumber(Math.abs(power), places, shortScale, false, false)); + } + } + + final StringBuilder result = new StringBuilder(); + if (number < 0) { + number = -number; + // from here on number is always positive + if (places != 0 || number >= 0.5) { + // do not add minus if number will be rounded to 0 + result.append(scientific ? "negative " : "minus "); + } + } + + final int realPlaces = Utils.decimalPlacesNoFinalZeros(number, places); + final boolean numberIsWhole = realPlaces == 0; + // if no decimal places to be printed, numberLong should be the rounded number + final long numberLong = (long) number + (number % 1 >= 0.5 && numberIsWhole ? 
1 : 0); + + if (!ordinal && numberIsWhole && numberLong > 1000 && numberLong < 2000) { + // deal with 4 digits that can be said like a date, i.e. 1972 => nineteen seventy two + + result.append(NUMBER_NAMES.get(numberLong / 100)); + result.append(" "); + if (numberLong % 100 == 0) { + // 1900 => nineteen hundred + result.append(NUMBER_NAMES.get(100L)); + } else if (numberLong % 100 < 10 && numberLong % 100 != 0) { + // 1906 => nineteen oh six + result.append("oh "); + result.append(NUMBER_NAMES.get(numberLong % 10)); + } else if (numberLong % 10 == 0 || numberLong % 100 < 20) { + // 1960 => nineteen sixty; 1911 => nineteen eleven + result.append(NUMBER_NAMES.get(numberLong % 100)); + } else { + // 1961 => nineteen sixty one + result.append(NUMBER_NAMES.get(numberLong % 100 - numberLong % 10)); + result.append(" "); + result.append(NUMBER_NAMES.get(numberLong % 10)); + } + + return result.toString(); + } + + if (!ordinal && NUMBER_NAMES.containsKey(numberLong)) { + if (number > 90) { + result.append("one "); + } + result.append(NUMBER_NAMES.get(numberLong)); + + } else if (shortScale) { + boolean ordi = ordinal && numberIsWhole; // not ordinal if not whole + final List groups = Utils.splitByModulus(numberLong, 1000); + final List groupNames = new ArrayList<>(); + for (int i = 0; i < groups.size(); ++i) { + final long z = groups.get(i); + if (z == 0) { + continue; // skip 000 groups + } + String groupName = subThousand(z, i == 0 && ordi); + + if (i != 0) { + final long magnitude = Utils.longPow(1000, i); + if (ordi) { + // ordi can be true only for the first group (i.e. at the end of the number) + if (z == 1) { + // remove "one" from first group (e.g. 
"one billion, millionth") + groupName = ORDINAL_NAMES_SHORT_SCALE.get(magnitude); + } else { + groupName += " " + ORDINAL_NAMES_SHORT_SCALE.get(magnitude); + } + } else { + groupName += " " + NUMBER_NAMES_SHORT_SCALE.get(magnitude); + } + } + + groupNames.add(groupName); + ordi = false; + } + + appendSplitGroups(result, groupNames); + + } else { + boolean ordi = ordinal && numberIsWhole; // not ordinal if not whole + final List groups = Utils.splitByModulus(numberLong, 1000000); + final List groupNames = new ArrayList<>(); + for (int i = 0; i < groups.size(); ++i) { + final long z = groups.get(i); + if (z == 0) { + continue; // skip 000000 groups + } + + String groupName; + if (z < 1000) { + groupName = subThousand(z, i == 0 && ordi); + } else { + groupName = subThousand(z / 1000, false) + " thousand"; + if (z % 1000 != 0) { + groupName += (i == 0 ? ", " : " ") + subThousand(z % 1000, i == 0 && ordi); + } else if (i == 0 && ordi) { + if (z / 1000 == 1) { + groupName = "thousandth"; // remove "one" from "one thousandth" + } else { + groupName += "th"; + } + } + } + + if (i != 0) { + final long magnitude = Utils.longPow(1000000, i); + if (ordi) { + // ordi can be true only for the first group (i.e. at the end of the number) + if (z == 1) { + // remove "one" from first group (e.g. "one billion, millionth") + groupName = ORDINAL_NAMES_LONG_SCALE.get(magnitude); + } else { + groupName += " " + ORDINAL_NAMES_LONG_SCALE.get(magnitude); + } + } else { + groupName += " " + NUMBER_NAMES_LONG_SCALE.get(magnitude); + } + } + + groupNames.add(groupName); + ordi = false; + } + + appendSplitGroups(result, groupNames); + } + + if (realPlaces > 0) { + if (number < 1.0 && (result.length() == 0 || "minus ".contentEquals(result))) { + result.append("zero"); // nothing was written before + } + result.append(" point"); + + final String fractionalPart = String.format("%." 
+ realPlaces + "f", number % 1); + for (int i = 2; i < fractionalPart.length(); ++i) { + result.append(" "); + result.append(NUMBER_NAMES.get((long) (fractionalPart.charAt(i) - '0'))); + } + } + + return result.toString(); + } + + @Override + public String niceTime(final LocalTime time, + final boolean speech, + final boolean use24Hour, + final boolean showAmPm) { + if (speech) { + if (use24Hour) { + final StringBuilder result = new StringBuilder(); + if (time.getHour() < 10) { + result.append("zero "); + } + result.append(pronounceNumberDuration(time.getHour())); + + result.append(" "); + if (time.getMinute() == 0) { + result.append("hundred"); + } else { + if (time.getMinute() < 10) { + result.append("zero "); + } + result.append(pronounceNumberDuration(time.getMinute())); + } + + return result.toString(); + } else { + if (time.getHour() == 0 && time.getMinute() == 0) { + return "midnight"; + } else if (time.getHour() == 12 && time.getMinute() == 0) { + return "noon"; + } + + final int normalizedHour = (time.getHour() + 11) % 12 + 1; // 1 to 12 + final StringBuilder result = new StringBuilder(); + if (time.getMinute() == 15) { + result.append("quarter past "); + result.append(pronounceNumberDuration(normalizedHour)); + } else if (time.getMinute() == 30) { + result.append("half past "); + result.append(pronounceNumberDuration(normalizedHour)); + } else if (time.getMinute() == 45) { + result.append("quarter to "); + result.append(pronounceNumberDuration(normalizedHour % 12 + 1)); + } else { + result.append(pronounceNumberDuration(normalizedHour)); + + if (time.getMinute() == 0) { + if (!showAmPm) { + return result + " o'clock"; + } + } else { + if (time.getMinute() < 10) { + result.append(" oh"); + } + result.append(" "); + result.append(pronounceNumberDuration(time.getMinute())); + } + } + + if (showAmPm) { + result.append(time.getHour() >= 12 ? " p.m." 
: " a.m."); + } + return result.toString(); + } + + } else { + if (use24Hour) { + return time.format(DateTimeFormatter.ofPattern("HH:mm", Locale.ENGLISH)); + } else { + final String result = time.format(DateTimeFormatter.ofPattern( + showAmPm ? "K:mm a" : "K:mm", Locale.ENGLISH)); + if (result.startsWith("0:")) { + return "12:" + result.substring(2); + } else { + return result; + } + } + } + } + + + /** + * @param n must be 0 <= n <= 999 + * @param ordinal whether to return an ordinal number (usually with -th) + * @return the string representation of a number smaller than 1000 + */ + private String subThousand(final long n, final boolean ordinal) { + // this function calls itself inside if branches to make sure `ordinal` is respected + if (ordinal && ORDINAL_NAMES.containsKey(n)) { + return ORDINAL_NAMES.get(n); + } else if (n < 100) { + if (!ordinal && NUMBER_NAMES.containsKey(n)) { + return NUMBER_NAMES.get(n); + } + // n is surely => 20 from here on, since all n < 20 are in (ORDINAL|NUMBER)_NAMES + + return NUMBER_NAMES.get(n - n % 10) + + (n % 10 > 0 ? " " + subThousand(n % 10, ordinal) : ""); + } else { + return NUMBER_NAMES.get(n / 100) + " hundred" + + (n % 100 > 0 ? " and " + subThousand(n % 100, ordinal) + : (ordinal ? 
"th" : "")); + } + } + + /** + * @param result the string builder to append the comma-separated group names to + * @param groupNames the group names + */ + private void appendSplitGroups(final StringBuilder result, final List groupNames) { + if (!groupNames.isEmpty()) { + result.append(groupNames.get(groupNames.size() - 1)); + } + + for (int i = groupNames.size() - 2; i >= 0; --i) { + result.append(", "); + result.append(groupNames.get(i)); + } + } +} From 997acf539da01e4591cb3d03b189fe4ede565d78 Mon Sep 17 00:00:00 2001 From: rauschen Date: Tue, 7 Feb 2023 09:53:36 +0100 Subject: [PATCH 08/13] [de-de] Tests: DateTimeConfigTest --- .../org/dicio/numbers/lang/de/DateTimeConfigTest.java | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 numbers/src/test/java/org/dicio/numbers/lang/de/DateTimeConfigTest.java diff --git a/numbers/src/test/java/org/dicio/numbers/lang/de/DateTimeConfigTest.java b/numbers/src/test/java/org/dicio/numbers/lang/de/DateTimeConfigTest.java new file mode 100644 index 00000000..e5f99662 --- /dev/null +++ b/numbers/src/test/java/org/dicio/numbers/lang/de/DateTimeConfigTest.java @@ -0,0 +1,10 @@ +package org.dicio.numbers.lang.de; + +import org.dicio.numbers.test.DateTimeConfigTestBase; + +public class DateTimeConfigTest extends DateTimeConfigTestBase { + @Override + public String configFolder() { + return "config/de-de"; + } +} From 7a034273f6a27058a67a2c4236b6607da43e8101 Mon Sep 17 00:00:00 2001 From: rauschen Date: Tue, 7 Feb 2023 14:28:29 +0100 Subject: [PATCH 09/13] [de-de] Test: DateTimeTest --- .../dicio/numbers/lang/de/DateTimeTest.java | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 numbers/src/test/java/org/dicio/numbers/lang/de/DateTimeTest.java diff --git a/numbers/src/test/java/org/dicio/numbers/lang/de/DateTimeTest.java b/numbers/src/test/java/org/dicio/numbers/lang/de/DateTimeTest.java new file mode 100644 index 00000000..a61c69e0 --- /dev/null +++ 
b/numbers/src/test/java/org/dicio/numbers/lang/de/DateTimeTest.java @@ -0,0 +1,46 @@ +package org.dicio.numbers.lang.de; + +import org.dicio.numbers.formatter.NumberFormatter; +import org.dicio.numbers.test.DateTimeTestBase; +import org.junit.Test; + +import java.time.LocalDate; +import java.time.LocalDateTime; + +import static org.junit.Assert.assertEquals; + +public class DateTimeTest extends DateTimeTestBase { + + @Override + public String configFolder() { + return "config/de-de"; + } + + @Override + public NumberFormatter buildNumberFormatter() { + return new GermanFormatter(); + } + + @Test + public void testNiceDate() { + // just check that the NumberParserFormatter functions do their job + assertEquals("Mittwoch, der achtundzwangzigste April zweitausendeinundzwanzig", + pf.niceDate(LocalDate.of(2021, 4, 28)).get()); + assertEquals("Sonntag, der dreizehnte August", + pf.niceDate(LocalDate.of(-84, 8, 13)).now(LocalDate.of(-84, 8, 23)).get()); + } + + @Test + public void testNiceYear() { + // just check that the NumberParserFormatter functions do their job + assertEquals("neunzehnhundertvierundachtzig", pf.niceYear(LocalDate.of(1984, 4, 28)).get()); + assertEquals("achhundertzehn v.d.Z.", pf.niceYear(LocalDate.of(-810, 8, 13)).get()); + } + + @Test + public void testNiceDateTime() { + // just check that the NumberParserFormatter functions do their job + assertEquals("Mittwoch, zwölfter September siebzehnhundertvierundsechzig, um zwölf Uhr mittags", pf.niceDateTime(LocalDateTime.of(1764, 9, 12, 12, 0)).get()); + assertEquals("Donnerstag, dritter November dreihundertachtundzwanzig v.d.Z. 
um fünf Uhr sieben", pf.niceDateTime(LocalDateTime.of(-328, 11, 3, 5, 7)).get()); + } +} From a34ce837e145087315ce75c00e3fb17df1434b8e Mon Sep 17 00:00:00 2001 From: rauschen Date: Tue, 7 Feb 2023 14:33:03 +0100 Subject: [PATCH 10/13] [de-de] Test: ExtractDurationTest --- .../numbers/lang/de/ExtractDurationTest.java | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 numbers/src/test/java/org/dicio/numbers/lang/de/ExtractDurationTest.java diff --git a/numbers/src/test/java/org/dicio/numbers/lang/de/ExtractDurationTest.java b/numbers/src/test/java/org/dicio/numbers/lang/de/ExtractDurationTest.java new file mode 100644 index 00000000..e718a560 --- /dev/null +++ b/numbers/src/test/java/org/dicio/numbers/lang/de/ExtractDurationTest.java @@ -0,0 +1,28 @@ +package org.dicio.numbers.lang.de; + +import static org.dicio.numbers.test.TestUtils.DAY; +import static org.dicio.numbers.test.TestUtils.t; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; + +import org.dicio.numbers.NumberParserFormatter; +import org.dicio.numbers.test.WithTokenizerTestBase; +import org.junit.Test; + +public class ExtractDurationTest extends WithTokenizerTestBase { + @Override + public String configFolder() { + return "config/de-de"; + } + + @Test + public void testNumberParserExtractDuration() { + final NumberParserFormatter npf + = new NumberParserFormatter(null, new GermanParser()); + assertNull(npf.extractDuration("hallo wie geht's").get()); + assertNull(npf.extractDuration("eine Milliarde Euro").shortScale(true).get()); + assertNull(npf.extractDuration("eine Million").shortScale(false).get()); + assertEquals(t(DAY), npf.extractDuration("vierundzwanzig Stunden sind nicht zwei Tage").get()); + assertEquals(t(2 * DAY), npf.extractDuration("zwei Tage sind nicht vierundzwanzig Stunden").get()); + } +} From 5c471607447231d93427ccc10e32ed0681ec9643 Mon Sep 17 00:00:00 2001 From: rauschen Date: Tue, 7 Feb 2023 14:42:30 +0100 Subject: 
[PATCH 11/13] [de-de] Test: NiceDurationTest --- .../numbers/lang/de/NiceDurationTest.java | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 numbers/src/test/java/org/dicio/numbers/lang/de/NiceDurationTest.java diff --git a/numbers/src/test/java/org/dicio/numbers/lang/de/NiceDurationTest.java b/numbers/src/test/java/org/dicio/numbers/lang/de/NiceDurationTest.java new file mode 100644 index 00000000..ae5711a3 --- /dev/null +++ b/numbers/src/test/java/org/dicio/numbers/lang/de/NiceDurationTest.java @@ -0,0 +1,72 @@ +package org.dicio.numbers.lang.de; + +import static org.dicio.numbers.test.TestUtils.F; +import static org.dicio.numbers.test.TestUtils.T; + +import org.dicio.numbers.formatter.NumberFormatter; +import org.dicio.numbers.test.NiceDurationTestBase; +import org.junit.Test; + +public class NiceDurationTest extends NiceDurationTestBase { + + @Override + public NumberFormatter buildNumberFormatter() { + return new GermanFormatter(); + } + + @Test + public void zero() { + assertDuration("null Sekunden", T, 0, 0, 0, 0); + assertDuration("0:00", F, 0, 0, 0, 0); + } + + @Test + public void speechOne() { + assertDuration("eine Sekunde", T, 0, 0, 0, 1); + assertDuration("eine Minute", T, 0, 0, 1, 0); + assertDuration("eine Stunde", T, 0, 1, 0, 0); + assertDuration("ein Tag", T, 1, 0, 0, 0); + } + + @Test + public void speechMany() { + assertDuration("fünf Sekunden", T, 0, 0, 0, 5); + assertDuration("zwei Minuten", T, 0, 0, 2, 0); + assertDuration("siebzehn Stunden", T, 0, 17, 0, 0); + assertDuration("vierundachtzig Tage", T, 84, 0, 0, 0); + } + + @Test + public void speech() { + assertDuration("sechs Tage dreiundzwanzig Stunden neunundfünfzig Minuten zweiunddreißig Sekunden", T, 6, 23, 59, 32); + assertDuration("neunzehn Tage zweiundfünfzig Minuten", T, 19, 0, 52, 0); + assertDuration("eine Stunde sechs Sekunden", T, 0, 1, 0, 6); + assertDuration("dreiundsechzig Tage vierundvierzig Sekunden", T, 63, 0, 0, 44); + assertDuration("ein Tag eine 
Stunde eine Minute eine Sekunde", T, 1, 1, 1, 1); + } + + @Test + public void noSpeechOne() { + assertDuration("0:01", F, 0, 0, 0, 1); + assertDuration("1:00", F, 0, 0, 1, 0); + assertDuration("1:00:00", F, 0, 1, 0, 0); + assertDuration("1d 0:00:00", F, 1, 0, 0, 0); + } + + @Test + public void noSpeechMany() { + assertDuration("0:39", F, 0, 0, 0, 39); + assertDuration("24:00", F, 0, 0, 24, 0); + assertDuration("3:00:00", F, 0, 3, 0, 0); + assertDuration("76d 0:00:00", F, 76, 0, 0, 0); + } + + @Test + public void noSpeech() { + assertDuration("6d 23:59:32", F, 6, 23, 59, 32); + assertDuration("19d 0:52:00", F, 19, 0, 52, 0); + assertDuration("1:00:06", F, 0, 1, 0, 6); + assertDuration("63d 0:00:44", F, 63, 0, 0, 44); + assertDuration("1d 1:01:01", F , 1, 1, 1, 1); + } +} From 7436566e07d8324d922e6ec56f0c55cfc086b7d3 Mon Sep 17 00:00:00 2001 From: rauschen Date: Tue, 7 Feb 2023 14:52:40 +0100 Subject: [PATCH 12/13] [de-de] Tests: NiceNumberTest --- .../dicio/numbers/lang/de/NiceNumberTest.java | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 numbers/src/test/java/org/dicio/numbers/lang/de/NiceNumberTest.java diff --git a/numbers/src/test/java/org/dicio/numbers/lang/de/NiceNumberTest.java b/numbers/src/test/java/org/dicio/numbers/lang/de/NiceNumberTest.java new file mode 100644 index 00000000..73402295 --- /dev/null +++ b/numbers/src/test/java/org/dicio/numbers/lang/de/NiceNumberTest.java @@ -0,0 +1,65 @@ +package org.dicio.numbers.lang.de; + +import org.dicio.numbers.NumberParserFormatter; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.util.Arrays; +import java.util.Collections; + +import static org.dicio.numbers.test.TestUtils.F; +import static org.junit.Assert.assertEquals; + +public class NiceNumberTest { + + private static NumberParserFormatter pf; + + @BeforeClass + public static void setup() { + pf = new NumberParserFormatter(new GermanFormatter(), null); + } + + @Test + public void speech() { + 
assertEquals("vierunddreißig und einhalb", pf.niceNumber(34.5).get()); + assertEquals("minus achtzehn und drei Fünftel", pf.niceNumber(-18.6).get()); + assertEquals("achtundneunzig und achtzehn Neunzehntel", pf.niceNumber(98.947368421).get()); + assertEquals("minus fünf und sechs Elftel", pf.niceNumber(-5.5454545).get()); + assertEquals("sieben Neuntel", pf.niceNumber(7.0 / 9).get()); + assertEquals("minus zwei Siebzehntel", pf.niceNumber(-2.0 / 17).get()); + assertEquals("vierhundertfünfundsechzig", pf.niceNumber(465).get()); + assertEquals("minus einundneunzig", pf.niceNumber(-91).get()); + assertEquals("null", pf.niceNumber(0).get()); + } + + @Test + public void noSpeech() { + assertEquals("34 1/2", pf.niceNumber(34.5).speech(F).get()); + assertEquals("-18 3/5", pf.niceNumber(-18.6).speech(F).get()); + assertEquals("98 18/19", pf.niceNumber(98.947368421).speech(F).get()); + assertEquals("-5 6/11", pf.niceNumber(-5.5454545).speech(F).get()); + assertEquals("7/9", pf.niceNumber(7.0 / 9).speech(F).get()); + assertEquals("-2/17", pf.niceNumber(-2.0 / 17).speech(F).get()); + assertEquals("465", pf.niceNumber(465).speech(F).get()); + assertEquals("-91", pf.niceNumber(-91).speech(F).get()); + assertEquals("0", pf.niceNumber(0).speech(F).get()); + } + + @Test + public void customDenominators() { + assertEquals("minus vier und vier Zehntel", pf.niceNumber(-4.4).denominators(Arrays.asList(2, 3, 4, 6, 7, 8, 9, 10, 11)).get()); + assertEquals("-64 6/12", pf.niceNumber(-64.5).speech(F).denominators(Collections.singletonList(12)).get()); + assertEquals("minus drei und fünfhunderttausend Millionstel", pf.niceNumber(-3.5).denominators(Arrays.asList(1000000, 2000000)).get()); + assertEquals("9 1000000/2000000", pf.niceNumber(9.5).speech(F).denominators(Arrays.asList(2000000, 1000000)).get()); + assertEquals("null komma acht", pf.niceNumber(4.0 / 5).denominators(Arrays.asList(2, 3, 4)).get()); + } + + @Test + public void invalidFraction() { + assertEquals("eins komma acht vier", 
pf.niceNumber(1.837).get()); + assertEquals("minus achtunddreißig komma eins neun", pf.niceNumber(-38.192).get()); + assertEquals("3829.48", pf.niceNumber(3829.47832).speech(F).get()); + assertEquals("-7.19", pf.niceNumber(-7.1928).speech(F).get()); + assertEquals("-9322.38", pf.niceNumber(-9322 - 8.0 / 21).speech(F).get()); + } +} From 8c198e9d72d6bd385b27465272ccc6fe0cee00e1 Mon Sep 17 00:00:00 2001 From: Stypox Date: Thu, 2 Oct 2025 11:33:36 +0200 Subject: [PATCH 13/13] Convert to Kotlin --- .../numbers/lang/de/GermanFormatter.java | 455 ----------------- .../dicio/numbers/lang/de/GermanFormatter.kt | 463 ++++++++++++++++++ .../dicio/numbers/lang/de/GermanParser.java | 42 -- .../dicio/numbers/lang/de/DateTimeTest.java | 4 +- .../numbers/lang/de/ExtractDurationTest.java | 16 +- .../numbers/lang/de/NiceDurationTest.java | 4 +- .../dicio/numbers/lang/de/NiceNumberTest.java | 6 +- 7 files changed, 478 insertions(+), 512 deletions(-) delete mode 100644 numbers/src/main/java/org/dicio/numbers/lang/de/GermanFormatter.java create mode 100644 numbers/src/main/java/org/dicio/numbers/lang/de/GermanFormatter.kt delete mode 100644 numbers/src/main/java/org/dicio/numbers/lang/de/GermanParser.java diff --git a/numbers/src/main/java/org/dicio/numbers/lang/de/GermanFormatter.java b/numbers/src/main/java/org/dicio/numbers/lang/de/GermanFormatter.java deleted file mode 100644 index 5e905929..00000000 --- a/numbers/src/main/java/org/dicio/numbers/lang/de/GermanFormatter.java +++ /dev/null @@ -1,455 +0,0 @@ -package org.dicio.numbers.lang.de; - -import org.dicio.numbers.formatter.NumberFormatter; -import org.dicio.numbers.util.MixedFraction; -import org.dicio.numbers.util.Utils; - -import java.time.LocalTime; -import java.time.format.DateTimeFormatter; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Locale; -import java.util.Map; - -public class GermanFormatter extends NumberFormatter { - - final Map NUMBER_NAMES = new HashMap() {{ - 
put(0L, "null"); - put(1L, "eins"); - put(2L, "zwei"); - put(3L, "drei"); - put(4L, "vier"); - put(5L, "fünf"); - put(6L, "sechs"); - put(7L, "sieben"); - put(8L, "acht"); - put(9L, "neun"); - put(10L, "zehn"); - put(11L, "elf"); - put(12L, "zwölf"); - put(13L, "dreizehn"); - put(14L, "vierzehn"); - put(15L, "fünfzehn"); - put(16L, "sechzehn"); - put(17L, "siebzehn"); - put(18L, "achtzehn"); - put(19L, "neunzehn"); - put(20L, "zwanzig"); - put(30L, "dreißig"); - put(40L, "vierzig"); - put(50L, "fünfzig"); - put(60L, "sechzig"); - put(70L, "siebzig"); - put(80L, "achtzig"); - put(90L, "neunzig"); - put(100L, "hundert"); - put(1000L, "tausend"); - put(1000000L, "million"); - put(1000000000L, "milliarde"); - put(1000000000000L, "billion"); - put(1000000000000000L, "billiarde"); - put(1000000000000000000L, "trillion"); - }}; - - final Map ORDINAL_NAMES = new HashMap() {{ - put(1L, "erste"); - put(2L, "zweite"); - put(3L, "dritte"); - put(4L, "vierte"); - put(5L, "fünfte"); - put(6L, "sechste"); - put(7L, "siebte"); - put(8L, "achte"); - put(9L, "neunte"); - put(10L, "zehnte"); - put(11L, "elfte"); - put(12L, "zwölfte"); - put(13L, "dreizehnte"); - put(14L, "vierzehnte"); - put(15L, "fünfzehnte"); - put(16L, "sechzehnte"); - put(17L, "siebzehnte"); - put(18L, "achtzehnte"); - put(19L, "neunzehnte"); - put(20L, "zwanzigste"); - put(30L, "dreißigste"); - put(40L, "vierzigste"); - put(50L, "fünfzigste"); - put(60L, "sechzigste"); - put(70L, "siebzigste"); - put(80L, "achtzigste"); - put(90L, "neunzigste"); - put(100L, "hundertste"); - put(1000L, "tausendste"); - put(1000000L, "millionste"); - put(1000000000L, "milliardste"); - put(1000000000000L, "billionste"); - put(1000000000000000L, "billiardste"); - put(1000000000000000000L, "trilliardste"); - }}; - - public GermanFormatter() { - super("config/de-de"); - } - - - - /** - * Format a number to a pronounceable representation. 
For example, -4000619 would be formatted - * into "minus four million, six hundred and nineteen" for English. - * - * @param number the number to pronounce - * @param places the number of decimal places to round decimal numbers to - * @param shortScale use short (true) or long (false) scale for large numbers (see - * - * Names of large numbers) - * @param scientific if true convert and pronounce in scientific notation - * @param ordinal if true pronounce in the ordinal form (e.g. "first" instead of "one" for - * English) - * @return the formatted number as a string - * - public abstract String pronounceNumber(double number, - int places, - boolean shortScale, - boolean scientific, - boolean ordinal); - -*/ - - @Override - public String niceNumber(final MixedFraction mixedFraction, final boolean speech) { - if (speech) { - final String sign = mixedFraction.negative ? "minus " : ""; - if (mixedFraction.numerator == 0) { - return sign + pronounceNumber(mixedFraction.whole, 0, true, false, false); - } - - String denominatorString; - if (mixedFraction.denominator == 1) { - denominatorString = "Eintel"; - } else if (mixedFraction.denominator == 2) { - denominatorString = "Halbe"; - } else if (mixedFraction.denominator == 3) { - denominatorString = "Drittel"; - } else if (mixedFraction.denominator == 7) { - denominatorString = "Siebtel"; - } else if (mixedFraction.denominator << 20) { - // below 20 use number name + suffix "tel" - denominatorString - = pronounceNumber(mixedFraction.denominator, 0, true, false, true) + "tel"; - } else { - // for 20+ use number name + suffix "stel" - denominatorString - = pronounceNumber(mixedFraction.denominator, 0, true, false, true) + "stel"; - } - - final String numeratorString; - numeratorString = pronounceNumber(mixedFraction.numerator, 0, true, false, false); - - if (mixedFraction.whole == 0) { - return sign + numeratorString + " " + denominatorString; - } else { - return sign + pronounceNumber(mixedFraction.whole, 0, true, false, 
false) - + " und " + numeratorString + " " + denominatorString; - } - - } else { - return niceNumberNotSpeech(mixedFraction); - } - } - - @Override - public String pronounceNumber(double number, - final int places, - final boolean shortScale, - final boolean scientific, - final boolean ordinal) { - - if (number == Double.POSITIVE_INFINITY) { - return "unendlich"; - } else if (number == Double.NEGATIVE_INFINITY) { - return "minus unendlich"; - } else if (Double.isNaN(number)) { - return "keine Zahl"; - } - - // also using scientific mode if the number is too big to be spoken fully. Checking against - // the biggest double smaller than 10^21 = 1000 * 10^18, which is the biggest pronounceable - // number, since e.g. 999.99 * 10^18 can be pronounced correctly. - if (scientific || Math.abs(number) > 999999999999999934463d) { - final String scientificFormatted = String.format(Locale.ENGLISH, "%E", number); - final String[] parts = scientificFormatted.split("E", 2); - final double power = Integer.parseInt(parts[1]); - - if (power != 0) { - // This handles negatives of powers separately from the normal - // handling since each call disables the scientific flag - final double n = Double.parseDouble(parts[0]); - return String.format("%s mal zehn hoch %s", - pronounceNumber(Math.abs(n), places, shortScale, false, false), - pronounceNumber(Math.abs(power), places, shortScale, false, false)); - } - } - - final StringBuilder result = new StringBuilder(); - if (number < 0) { - number = -number; - // from here on number is always positive - if (places != 0 || number >= 0.5) { - // do not add minus if number will be rounded to 0 - result.append(scientific ? "negative " : "minus "); - } - } - - final int realPlaces = Utils.decimalPlacesNoFinalZeros(number, places); - final boolean numberIsWhole = realPlaces == 0; - // if no decimal places to be printed, numberLong should be the rounded number - final long numberLong = (long) number + (number % 1 >= 0.5 && numberIsWhole ? 
1 : 0); - - if (!ordinal && numberIsWhole && numberLong > 1000 && numberLong < 2000) { - // deal with 4 digits that can be said like a date, i.e. 1972 => nineteen seventy two - - result.append(NUMBER_NAMES.get(numberLong / 100)); - result.append(" "); - if (numberLong % 100 == 0) { - // 1900 => nineteen hundred - result.append(NUMBER_NAMES.get(100L)); - } else if (numberLong % 100 < 10 && numberLong % 100 != 0) { - // 1906 => nineteen oh six - result.append("oh "); - result.append(NUMBER_NAMES.get(numberLong % 10)); - } else if (numberLong % 10 == 0 || numberLong % 100 < 20) { - // 1960 => nineteen sixty; 1911 => nineteen eleven - result.append(NUMBER_NAMES.get(numberLong % 100)); - } else { - // 1961 => nineteen sixty one - result.append(NUMBER_NAMES.get(numberLong % 100 - numberLong % 10)); - result.append(" "); - result.append(NUMBER_NAMES.get(numberLong % 10)); - } - - return result.toString(); - } - - if (!ordinal && NUMBER_NAMES.containsKey(numberLong)) { - if (number > 90) { - result.append("one "); - } - result.append(NUMBER_NAMES.get(numberLong)); - - } else if (shortScale) { - boolean ordi = ordinal && numberIsWhole; // not ordinal if not whole - final List groups = Utils.splitByModulus(numberLong, 1000); - final List groupNames = new ArrayList<>(); - for (int i = 0; i < groups.size(); ++i) { - final long z = groups.get(i); - if (z == 0) { - continue; // skip 000 groups - } - String groupName = subThousand(z, i == 0 && ordi); - - if (i != 0) { - final long magnitude = Utils.longPow(1000, i); - if (ordi) { - // ordi can be true only for the first group (i.e. at the end of the number) - if (z == 1) { - // remove "one" from first group (e.g. 
"one billion, millionth") - groupName = ORDINAL_NAMES_SHORT_SCALE.get(magnitude); - } else { - groupName += " " + ORDINAL_NAMES_SHORT_SCALE.get(magnitude); - } - } else { - groupName += " " + NUMBER_NAMES_SHORT_SCALE.get(magnitude); - } - } - - groupNames.add(groupName); - ordi = false; - } - - appendSplitGroups(result, groupNames); - - } else { - boolean ordi = ordinal && numberIsWhole; // not ordinal if not whole - final List groups = Utils.splitByModulus(numberLong, 1000000); - final List groupNames = new ArrayList<>(); - for (int i = 0; i < groups.size(); ++i) { - final long z = groups.get(i); - if (z == 0) { - continue; // skip 000000 groups - } - - String groupName; - if (z < 1000) { - groupName = subThousand(z, i == 0 && ordi); - } else { - groupName = subThousand(z / 1000, false) + " thousand"; - if (z % 1000 != 0) { - groupName += (i == 0 ? ", " : " ") + subThousand(z % 1000, i == 0 && ordi); - } else if (i == 0 && ordi) { - if (z / 1000 == 1) { - groupName = "thousandth"; // remove "one" from "one thousandth" - } else { - groupName += "th"; - } - } - } - - if (i != 0) { - final long magnitude = Utils.longPow(1000000, i); - if (ordi) { - // ordi can be true only for the first group (i.e. at the end of the number) - if (z == 1) { - // remove "one" from first group (e.g. "one billion, millionth") - groupName = ORDINAL_NAMES_LONG_SCALE.get(magnitude); - } else { - groupName += " " + ORDINAL_NAMES_LONG_SCALE.get(magnitude); - } - } else { - groupName += " " + NUMBER_NAMES_LONG_SCALE.get(magnitude); - } - } - - groupNames.add(groupName); - ordi = false; - } - - appendSplitGroups(result, groupNames); - } - - if (realPlaces > 0) { - if (number < 1.0 && (result.length() == 0 || "minus ".contentEquals(result))) { - result.append("zero"); // nothing was written before - } - result.append(" point"); - - final String fractionalPart = String.format("%." 
+ realPlaces + "f", number % 1); - for (int i = 2; i < fractionalPart.length(); ++i) { - result.append(" "); - result.append(NUMBER_NAMES.get((long) (fractionalPart.charAt(i) - '0'))); - } - } - - return result.toString(); - } - - @Override - public String niceTime(final LocalTime time, - final boolean speech, - final boolean use24Hour, - final boolean showAmPm) { - if (speech) { - if (use24Hour) { - final StringBuilder result = new StringBuilder(); - if (time.getHour() < 10) { - result.append("zero "); - } - result.append(pronounceNumberDuration(time.getHour())); - - result.append(" "); - if (time.getMinute() == 0) { - result.append("hundred"); - } else { - if (time.getMinute() < 10) { - result.append("zero "); - } - result.append(pronounceNumberDuration(time.getMinute())); - } - - return result.toString(); - } else { - if (time.getHour() == 0 && time.getMinute() == 0) { - return "midnight"; - } else if (time.getHour() == 12 && time.getMinute() == 0) { - return "noon"; - } - - final int normalizedHour = (time.getHour() + 11) % 12 + 1; // 1 to 12 - final StringBuilder result = new StringBuilder(); - if (time.getMinute() == 15) { - result.append("quarter past "); - result.append(pronounceNumberDuration(normalizedHour)); - } else if (time.getMinute() == 30) { - result.append("half past "); - result.append(pronounceNumberDuration(normalizedHour)); - } else if (time.getMinute() == 45) { - result.append("quarter to "); - result.append(pronounceNumberDuration(normalizedHour % 12 + 1)); - } else { - result.append(pronounceNumberDuration(normalizedHour)); - - if (time.getMinute() == 0) { - if (!showAmPm) { - return result + " o'clock"; - } - } else { - if (time.getMinute() < 10) { - result.append(" oh"); - } - result.append(" "); - result.append(pronounceNumberDuration(time.getMinute())); - } - } - - if (showAmPm) { - result.append(time.getHour() >= 12 ? " p.m." 
: " a.m."); - } - return result.toString(); - } - - } else { - if (use24Hour) { - return time.format(DateTimeFormatter.ofPattern("HH:mm", Locale.ENGLISH)); - } else { - final String result = time.format(DateTimeFormatter.ofPattern( - showAmPm ? "K:mm a" : "K:mm", Locale.ENGLISH)); - if (result.startsWith("0:")) { - return "12:" + result.substring(2); - } else { - return result; - } - } - } - } - - - /** - * @param n must be 0 <= n <= 999 - * @param ordinal whether to return an ordinal number (usually with -th) - * @return the string representation of a number smaller than 1000 - */ - private String subThousand(final long n, final boolean ordinal) { - // this function calls itself inside if branches to make sure `ordinal` is respected - if (ordinal && ORDINAL_NAMES.containsKey(n)) { - return ORDINAL_NAMES.get(n); - } else if (n < 100) { - if (!ordinal && NUMBER_NAMES.containsKey(n)) { - return NUMBER_NAMES.get(n); - } - // n is surely => 20 from here on, since all n < 20 are in (ORDINAL|NUMBER)_NAMES - - return NUMBER_NAMES.get(n - n % 10) - + (n % 10 > 0 ? " " + subThousand(n % 10, ordinal) : ""); - } else { - return NUMBER_NAMES.get(n / 100) + " hundred" - + (n % 100 > 0 ? " and " + subThousand(n % 100, ordinal) - : (ordinal ? 
"th" : "")); - } - } - - /** - * @param result the string builder to append the comma-separated group names to - * @param groupNames the group names - */ - private void appendSplitGroups(final StringBuilder result, final List groupNames) { - if (!groupNames.isEmpty()) { - result.append(groupNames.get(groupNames.size() - 1)); - } - - for (int i = groupNames.size() - 2; i >= 0; --i) { - result.append(", "); - result.append(groupNames.get(i)); - } - } -} diff --git a/numbers/src/main/java/org/dicio/numbers/lang/de/GermanFormatter.kt b/numbers/src/main/java/org/dicio/numbers/lang/de/GermanFormatter.kt new file mode 100644 index 00000000..f5655e69 --- /dev/null +++ b/numbers/src/main/java/org/dicio/numbers/lang/de/GermanFormatter.kt @@ -0,0 +1,463 @@ +package org.dicio.numbers.lang.de + +import org.dicio.numbers.formatter.Formatter +import org.dicio.numbers.lang.en.EnglishFormatter +import org.dicio.numbers.unit.MixedFraction +import org.dicio.numbers.util.Utils.decimalPlacesNoFinalZeros +import org.dicio.numbers.util.Utils.longPow +import org.dicio.numbers.util.Utils.splitByModulus +import java.time.LocalTime +import java.time.format.DateTimeFormatter +import java.util.Locale +import kotlin.math.abs + +class GermanFormatter : Formatter("config/de-de") { + override fun niceNumber(mixedFraction: MixedFraction, speech: Boolean): String { + if (speech) { + val sign = if (mixedFraction.negative) "minus " else "" + if (mixedFraction.numerator == 0) { + return sign + pronounceNumber(mixedFraction.whole.toDouble(), 0, shortScale = true, + scientific = false, + ordinal = false + ) + } + + val denominatorString: String? 
+ if (mixedFraction.denominator == 1) { + denominatorString = "Eintel" + } else if (mixedFraction.denominator == 2) { + denominatorString = "Halbe" + } else if (mixedFraction.denominator == 3) { + denominatorString = "Drittel" + } else if (mixedFraction.denominator == 7) { + denominatorString = "Siebtel" + } else if (mixedFraction.denominator < 20) { + // below 20 use number name + suffix "tel" + denominatorString = pronounceNumber( + mixedFraction.denominator.toDouble(), + 0, + shortScale = true, + scientific = false, + ordinal = true + ) + "tel" + } else { + // for 20+ use number name + suffix "stel" + denominatorString = pronounceNumber( + mixedFraction.denominator.toDouble(), + 0, + shortScale = true, + scientific = false, + ordinal = true + ) + "stel" + } + val numeratorString = pronounceNumber(mixedFraction.numerator.toDouble(), 0, + shortScale = true, + scientific = false, + ordinal = false + ) + + return if (mixedFraction.whole == 0L) { + "$sign$numeratorString $denominatorString" + } else { + (sign + pronounceNumber( + mixedFraction.whole.toDouble(), + 0, + shortScale = true, + scientific = false, + ordinal = false + ) + " und " + numeratorString + " " + denominatorString) + } + } else { + return niceNumberNotSpeech(mixedFraction) + } + } + + override fun pronounceNumber( + number: Double, + places: Int, + shortScale: Boolean, + scientific: Boolean, + ordinal: Boolean + ): String { + var number = number + if (number == Double.POSITIVE_INFINITY) { + return "unendlich" + } else if (number == Double.Companion.NEGATIVE_INFINITY) { + return "minus unendlich" + } else if (number.isNaN()) { + return "keine Zahl" + } + + // also using scientific mode if the number is too big to be spoken fully. Checking against + // the biggest double smaller than 10^21 = 1000 * 10^18, which is the biggest pronounceable + // number, since e.g. 999.99 * 10^18 can be pronounced correctly. 
+ if (scientific || abs(number) > 999999999999999934463.0) { + val scientificFormatted = String.format(Locale.ENGLISH, "%E", number) + val parts: Array = + scientificFormatted.split("E".toRegex(), limit = 2).toTypedArray() + val power = parts[1]!!.toInt().toDouble() + + if (power != 0.0) { + // This handles negatives of powers separately from the normal + // handling since each call disables the scientific flag + val n = parts[0]!!.toDouble() + return String.format( + "%s mal zehn hoch %s", + pronounceNumber(abs(n), places, shortScale, + scientific = false, + ordinal = false + ), + pronounceNumber(abs(power), places, shortScale, + scientific = false, + ordinal = false + ) + ) + } + } + + val result = StringBuilder() + if (number < 0) { + number = -number + // from here on number is always positive + if (places != 0 || number >= 0.5) { + // do not add minus if number will be rounded to 0 + result.append(if (scientific) "negative " else "minus ") + } + } + + val realPlaces = decimalPlacesNoFinalZeros(number, places) + val numberIsWhole = realPlaces == 0 + // if no decimal places to be printed, numberLong should be the rounded number + val numberLong = number.toLong() + (if (number % 1 >= 0.5 && numberIsWhole) 1 else 0) + + if (!ordinal && numberIsWhole && numberLong > 1000 && numberLong < 2000) { + // deal with 4 digits that can be said like a date, i.e. 
1972 => nineteen seventy two + + result.append(NUMBER_NAMES[numberLong / 100]) + result.append(" ") + if (numberLong % 100 == 0L) { + // 1900 => nineteen hundred + result.append(NUMBER_NAMES[100L]) + } else if (numberLong % 100 < 10 && numberLong % 100 != 0L) { + // 1906 => nineteen oh six + result.append("oh ") + result.append(NUMBER_NAMES[numberLong % 10]) + } else if (numberLong % 10 == 0L || numberLong % 100 < 20) { + // 1960 => nineteen sixty; 1911 => nineteen eleven + result.append(NUMBER_NAMES[numberLong % 100]) + } else { + // 1961 => nineteen sixty one + result.append(NUMBER_NAMES[numberLong % 100 - numberLong % 10]) + result.append(" ") + result.append(NUMBER_NAMES[numberLong % 10]) + } + + return result.toString() + } + + if (!ordinal && NUMBER_NAMES.containsKey(numberLong)) { + if (number > 90) { + result.append("one ") + } + result.append(NUMBER_NAMES[numberLong]) + } else if (shortScale) { + var ordi = ordinal && numberIsWhole // not ordinal if not whole + val groups = splitByModulus(numberLong, 1000) + val groupNames: MutableList = ArrayList() + for (i in groups.indices) { + val z: Long = groups[i] + if (z == 0L) { + continue // skip 000 groups + } + var groupName = subThousand(z, i == 0 && ordi) + + if (i != 0) { + val magnitude = longPow(1000, i) + if (ordi) { + // ordi can be true only for the first group (i.e. at the end of the number) + if (z == 1L) { + // remove "one" from first group (e.g. 
"one billion, millionth") + groupName = + EnglishFormatter.Companion.ORDINAL_NAMES_SHORT_SCALE[magnitude] + } else { + groupName += " " + EnglishFormatter.Companion.ORDINAL_NAMES_SHORT_SCALE[magnitude] + } + } else { + groupName += " " + EnglishFormatter.Companion.NUMBER_NAMES_SHORT_SCALE[magnitude] + } + } + + groupNames.add(groupName) + ordi = false + } + + appendSplitGroups(result, groupNames) + } else { + var ordi = ordinal && numberIsWhole // not ordinal if not whole + val groups = splitByModulus(numberLong, 1000000) + val groupNames: MutableList = ArrayList() + for (i in groups.indices) { + val z: Long = groups[i] + if (z == 0L) { + continue // skip 000000 groups + } + + var groupName: String? + if (z < 1000) { + groupName = subThousand(z, i == 0 && ordi) + } else { + groupName = subThousand(z / 1000, false) + " thousand" + if (z % 1000 != 0L) { + groupName += (if (i == 0) ", " else " ") + subThousand( + z % 1000, + i == 0 && ordi + ) + } else if (i == 0 && ordi) { + if (z / 1000 == 1L) { + groupName = "thousandth" // remove "one" from "one thousandth" + } else { + groupName += "th" + } + } + } + + if (i != 0) { + val magnitude = longPow(1000000, i) + if (ordi) { + // ordi can be true only for the first group (i.e. at the end of the number) + if (z == 1L) { + // remove "one" from first group (e.g. "one billion, millionth") + groupName = + EnglishFormatter.Companion.ORDINAL_NAMES_LONG_SCALE[magnitude] + } else { + groupName += " " + EnglishFormatter.Companion.ORDINAL_NAMES_LONG_SCALE[magnitude] + } + } else { + groupName += " " + EnglishFormatter.Companion.NUMBER_NAMES_LONG_SCALE[magnitude] + } + } + + groupNames.add(groupName) + ordi = false + } + + appendSplitGroups(result, groupNames) + } + + if (realPlaces > 0) { + if (number < 1.0 && (result.isEmpty() || "minus ".contentEquals(result))) { + result.append("zero") // nothing was written before + } + result.append(" point") + + val fractionalPart = String.format("%." 
+ realPlaces + "f", number % 1) + for (i in 2..= 12) " p.m." else " a.m.") + } + return result.toString() + } + } else { + return if (use24Hour) { + time.format(DateTimeFormatter.ofPattern("HH:mm", Locale.ENGLISH)) + } else { + val result = time.format( + DateTimeFormatter.ofPattern( + if (showAmPm) "K:mm a" else "K:mm", Locale.ENGLISH + ) + ) + if (result.startsWith("0:")) { + "12:" + result.substring(2) + } else { + result + } + } + } + } + + + /** + * @param n must be 0 <= n <= 999 + * @param ordinal whether to return an ordinal number (usually with -th) + * @return the string representation of a number smaller than 1000 + */ + private fun subThousand(n: Long, ordinal: Boolean): String? { + // this function calls itself inside if branches to make sure `ordinal` is respected + if (ordinal && ORDINAL_NAMES.containsKey(n)) { + return ORDINAL_NAMES[n] + } else if (n < 100) { + if (!ordinal && NUMBER_NAMES.containsKey(n)) { + return NUMBER_NAMES[n] + } + + // n is surely => 20 from here on, since all n < 20 are in (ORDINAL|NUMBER)_NAMES + return (NUMBER_NAMES[n - n % 10] + + (if (n % 10 > 0) " " + subThousand(n % 10, ordinal) else "")) + } else { + return (NUMBER_NAMES[n / 100] + " hundred" + + (if (n % 100 > 0) + " and " + subThousand(n % 100, ordinal) + else + (if (ordinal) "th" else ""))) + } + } + + /** + * @param result the string builder to append the comma-separated group names to + * @param groupNames the group names + */ + private fun appendSplitGroups(result: StringBuilder, groupNames: MutableList) { + if (!groupNames.isEmpty()) { + result.append(groupNames[groupNames.size - 1]) + } + + for (i in groupNames.size - 2 downTo 0) { + result.append(", ") + result.append(groupNames[i]) + } + } + + companion object { + val NUMBER_NAMES = mapOf( + 0L to "null", + 1L to "eins", + 2L to "zwei", + 3L to "drei", + 4L to "vier", + 5L to "fünf", + 6L to "sechs", + 7L to "sieben", + 8L to "acht", + 9L to "neun", + 10L to "zehn", + 11L to "elf", + 12L to "zwölf", + 13L to 
"dreizehn", + 14L to "vierzehn", + 15L to "fünfzehn", + 16L to "sechzehn", + 17L to "siebzehn", + 18L to "achtzehn", + 19L to "neunzehn", + 20L to "zwanzig", + 30L to "dreißig", + 40L to "vierzig", + 50L to "fünfzig", + 60L to "sechzig", + 70L to "siebzig", + 80L to "achtzig", + 90L to "neunzig", + 100L to "hundert", + 1000L to "tausend", + 1000000L to "million", + 1000000000L to "milliarde", + 1000000000000L to "billion", + 1000000000000000L to "billiarde", + 1000000000000000000L to "trillion", + ) + + val ORDINAL_NAMES = mapOf( + 1L to "erste", + 2L to "zweite", + 3L to "dritte", + 4L to "vierte", + 5L to "fünfte", + 6L to "sechste", + 7L to "siebte", + 8L to "achte", + 9L to "neunte", + 10L to "zehnte", + 11L to "elfte", + 12L to "zwölfte", + 13L to "dreizehnte", + 14L to "vierzehnte", + 15L to "fünfzehnte", + 16L to "sechzehnte", + 17L to "siebzehnte", + 18L to "achtzehnte", + 19L to "neunzehnte", + 20L to "zwanzigste", + 30L to "dreißigste", + 40L to "vierzigste", + 50L to "fünfzigste", + 60L to "sechzigste", + 70L to "siebzigste", + 80L to "achtzigste", + 90L to "neunzigste", + 100L to "hundertste", + 1000L to "tausendste", + 1000000L to "millionste", + 1000000000L to "milliardste", + 1000000000000L to "billionste", + 1000000000000000L to "billiardste", + 1000000000000000000L to "trilliardste", + ) + } +} diff --git a/numbers/src/main/java/org/dicio/numbers/lang/de/GermanParser.java b/numbers/src/main/java/org/dicio/numbers/lang/de/GermanParser.java deleted file mode 100644 index 0573b1a7..00000000 --- a/numbers/src/main/java/org/dicio/numbers/lang/de/GermanParser.java +++ /dev/null @@ -1,42 +0,0 @@ -package org.dicio.numbers.lang.de; - -import org.dicio.numbers.parser.NumberParser; -import org.dicio.numbers.parser.lexer.TokenStream; -import org.dicio.numbers.util.DurationExtractorUtils; - -import java.time.Duration; -import java.time.LocalDateTime; -import java.time.LocalTime; -import java.util.List; - -public class GermanParser extends NumberParser { - - 
public GermanParser() { - super("config/de-de"); - } - - - @Override - public List extractNumbers(final String utterance, - final boolean shortScale, - final boolean preferOrdinal) { - return new GermanNumberExtractor(new TokenStream(tokenizer.tokenize(utterance)), - shortScale, preferOrdinal).extractNumbers(); - } - - @Override - public Duration extractDuration(final String utterance, final boolean shortScale) { - final TokenStream tokenStream = new TokenStream(tokenizer.tokenize(utterance)); - final GermanNumberExtractor numberExtractor - = new GermanNumberExtractor(tokenStream, shortScale, false); - return DurationExtractorUtils.extractDuration(tokenStream, - numberExtractor::extractOneNumberNoOrdinal); - } - - @Override - public LocalDateTime extractDateTime(final String utterance, - final boolean anchorDate, - final LocalTime defaultTime) { - return null; - } -} diff --git a/numbers/src/test/java/org/dicio/numbers/lang/de/DateTimeTest.java b/numbers/src/test/java/org/dicio/numbers/lang/de/DateTimeTest.java index a61c69e0..4f73e6a0 100644 --- a/numbers/src/test/java/org/dicio/numbers/lang/de/DateTimeTest.java +++ b/numbers/src/test/java/org/dicio/numbers/lang/de/DateTimeTest.java @@ -1,6 +1,6 @@ package org.dicio.numbers.lang.de; -import org.dicio.numbers.formatter.NumberFormatter; +import org.dicio.numbers.formatter.Formatter; import org.dicio.numbers.test.DateTimeTestBase; import org.junit.Test; @@ -17,7 +17,7 @@ public String configFolder() { } @Override - public NumberFormatter buildNumberFormatter() { + public Formatter buildNumberFormatter() { return new GermanFormatter(); } diff --git a/numbers/src/test/java/org/dicio/numbers/lang/de/ExtractDurationTest.java b/numbers/src/test/java/org/dicio/numbers/lang/de/ExtractDurationTest.java index e718a560..ea70e177 100644 --- a/numbers/src/test/java/org/dicio/numbers/lang/de/ExtractDurationTest.java +++ b/numbers/src/test/java/org/dicio/numbers/lang/de/ExtractDurationTest.java @@ -5,7 +5,7 @@ import static 
org.junit.Assert.assertEquals; import static org.junit.Assert.assertNull; -import org.dicio.numbers.NumberParserFormatter; +import org.dicio.numbers.ParserFormatter; import org.dicio.numbers.test.WithTokenizerTestBase; import org.junit.Test; @@ -17,12 +17,12 @@ public String configFolder() { @Test public void testNumberParserExtractDuration() { - final NumberParserFormatter npf - = new NumberParserFormatter(null, new GermanParser()); - assertNull(npf.extractDuration("hallo wie geht's").get()); - assertNull(npf.extractDuration("eine Milliarde Euro").shortScale(true).get()); - assertNull(npf.extractDuration("eine Million").shortScale(false).get()); - assertEquals(t(DAY), npf.extractDuration("vierundzwanzig Stunden sind nicht zwei Tage").get()); - assertEquals(t(2 * DAY), npf.extractDuration("zwei Tage sind nicht vierundzwanzig Stunden").get()); + final ParserFormatter npf + = new ParserFormatter(null, null); + assertNull(npf.extractDuration("hallo wie geht's").getFirst()); + assertNull(npf.extractDuration("eine Milliarde Euro").shortScale(true).getFirst()); + assertNull(npf.extractDuration("eine Million").shortScale(false).getFirst()); + assertEquals(t(DAY), npf.extractDuration("vierundzwanzig Stunden sind nicht zwei Tage").getFirst().toJavaDuration()); + assertEquals(t(2 * DAY), npf.extractDuration("zwei Tage sind nicht vierundzwanzig Stunden").getFirst().toJavaDuration()); } } diff --git a/numbers/src/test/java/org/dicio/numbers/lang/de/NiceDurationTest.java b/numbers/src/test/java/org/dicio/numbers/lang/de/NiceDurationTest.java index ae5711a3..e8ea62ff 100644 --- a/numbers/src/test/java/org/dicio/numbers/lang/de/NiceDurationTest.java +++ b/numbers/src/test/java/org/dicio/numbers/lang/de/NiceDurationTest.java @@ -3,14 +3,14 @@ import static org.dicio.numbers.test.TestUtils.F; import static org.dicio.numbers.test.TestUtils.T; -import org.dicio.numbers.formatter.NumberFormatter; +import org.dicio.numbers.formatter.Formatter; import 
org.dicio.numbers.test.NiceDurationTestBase; import org.junit.Test; public class NiceDurationTest extends NiceDurationTestBase { @Override - public NumberFormatter buildNumberFormatter() { + public Formatter buildNumberFormatter() { return new GermanFormatter(); } diff --git a/numbers/src/test/java/org/dicio/numbers/lang/de/NiceNumberTest.java b/numbers/src/test/java/org/dicio/numbers/lang/de/NiceNumberTest.java index 73402295..89e6d607 100644 --- a/numbers/src/test/java/org/dicio/numbers/lang/de/NiceNumberTest.java +++ b/numbers/src/test/java/org/dicio/numbers/lang/de/NiceNumberTest.java @@ -1,6 +1,6 @@ package org.dicio.numbers.lang.de; -import org.dicio.numbers.NumberParserFormatter; +import org.dicio.numbers.ParserFormatter; import org.junit.BeforeClass; import org.junit.Test; @@ -12,11 +12,11 @@ public class NiceNumberTest { - private static NumberParserFormatter pf; + private static ParserFormatter pf; @BeforeClass public static void setup() { - pf = new NumberParserFormatter(new GermanFormatter(), null); + pf = new ParserFormatter(new GermanFormatter(), null); } @Test