diff --git a/normalization/languages/__init__.py b/normalization/languages/__init__.py index 18e07a6..905c82f 100644 --- a/normalization/languages/__init__.py +++ b/normalization/languages/__init__.py @@ -1,7 +1,15 @@ -from . import english, french, german, italian, spanish +from . import dutch, english, french, german, italian, spanish from .base import LanguageOperators from .registry import get_language_registry, register_language register_language(LanguageOperators) -__all__ = ["english", "french", "german", "italian", "spanish", "get_language_registry"] +__all__ = [ + "dutch", + "english", + "french", + "german", + "italian", + "spanish", + "get_language_registry", +] diff --git a/normalization/languages/dutch/__init__.py b/normalization/languages/dutch/__init__.py new file mode 100644 index 0000000..8be3be6 --- /dev/null +++ b/normalization/languages/dutch/__init__.py @@ -0,0 +1,7 @@ +from .operators import DutchOperators +from .replacements import DUTCH_REPLACEMENTS + +__all__ = [ + "DutchOperators", + "DUTCH_REPLACEMENTS", +] diff --git a/normalization/languages/dutch/number_normalizer.py b/normalization/languages/dutch/number_normalizer.py new file mode 100644 index 0000000..73653ee --- /dev/null +++ b/normalization/languages/dutch/number_normalizer.py @@ -0,0 +1,144 @@ +"""Dutch number normalizer using text2num's alpha2digit. + +Converts spelled-out numbers to digits (e.g. vijf en twintig → 25) and handles +mixed digit+word forms (e.g. 3 miljard → drie miljard) before conversion so +alpha2digit does not misinterpret them. Optionally rewrites currency symbols to +amount + spoken singular unit, then restores plural trailing words from config. +""" + +import re + +from text_to_num import alpha2digit + +# Digit-to-Dutch-word mapping for normalizing "3 miljard" → "drie miljard". +_DIGIT_TO_DUTCH: dict[str, str] = { + "0": "nul", + "1": "een", + "2": "twee", + "3": "drie", + "4": "vier", + "5": "vijf", + "6": "zes", + "7": "zeven", + "8": "acht", + "9": "negen", +} + +# Pattern: digit(s) followed by Dutch large-number multipliers. +_RE_MIXED_NUMBER = re.compile( + r"\b(\d+)\s+(miljoen|miljoenen|miljard|miljarden|biljoen|biljoenen)\b", + re.IGNORECASE, +) + + +def _normalize_mixed_numbers(text: str) -> str: + """Convert '3 miljard' → 'drie miljard' so alpha2digit yields 3e9, not '3 1000000000'. + + alpha2digit may concatenate a lone digit with the following word; converting + the digit to a word avoids that (e.g. 'drie miljard' → 3000000000). + """ + + def replace(match: re.Match) -> str: + number = match.group(1) + multiplier = match.group(2) + if len(number) == 1 and number in _DIGIT_TO_DUTCH: + return f"{_DIGIT_TO_DUTCH[number]} {multiplier}" + # Multi-digit: keep as-is; alpha2digit will handle or leave unchanged + return match.group(0) + + return _RE_MIXED_NUMBER.sub(replace, text) + + +def _singular_spoken_unit(trailing_word: str) -> str: + """Map ``currency_symbol_to_word`` value to a spoken singular alpha2digit accepts.""" + t = trailing_word.lower() + if t == "euros": + return "euro" + if t == "dollars": + return "dollar" + if t == "ponden": + return "pond" + if t == "yens": + return "yen" + return trailing_word + + +def _normalize_currency_symbols( + text: str, + currency_symbol_to_word: dict[str, str] | None, +) -> str: + if not currency_symbol_to_word: + return text + num = r"\d+(?:[.,]\d+)?" + for symbol, trailing in currency_symbol_to_word.items(): + singular = _singular_spoken_unit(trailing) + esc = re.escape(symbol) + text = re.sub(rf"{esc}\s*({num})", rf"\1 {singular}", text, flags=re.IGNORECASE) + text = re.sub(rf"({num})\s*{esc}", rf"\1 {singular}", text, flags=re.IGNORECASE) + return text + + +def _currency_plural_fix_patterns( + currency_symbol_to_word: dict[str, str] | None, +) -> tuple[tuple[re.Pattern[str], str], ...]: + """Build (pattern, replacement) pairs so digit + alpha2digit singular → config trailing word.""" + if not currency_symbol_to_word: + return () + amount = r"(\d+(?:[.,]\d+)?)" + seen: set[str] = set() + out: list[tuple[re.Pattern[str], str]] = [] + for _symbol, trailing in currency_symbol_to_word.items(): + tl = trailing.lower() + if tl in seen: + continue + seen.add(tl) + singular = _singular_spoken_unit(trailing) + if singular.lower() == tl: + continue + if tl == "euros": + pat = re.compile(rf"\b{amount}\s+euro(?:'s)?\b", re.IGNORECASE) + out.append((pat, rf"\1 {trailing}")) + else: + pat = re.compile( + rf"\b{amount}\s+{re.escape(singular)}\b", + re.IGNORECASE, + ) + out.append((pat, rf"\1 {trailing}")) + return tuple(out) + + +def _apply_currency_plural_fixes( + text: str, + fixers: tuple[tuple[re.Pattern[str], str], ...], +) -> str: + for pattern, repl in fixers: + text = pattern.sub(repl, text) + return text + + +class DutchNumberNormalizer: + """Convert Dutch spelled-out numbers to digits via text2num.alpha2digit. + + Applies pre-passes for currency symbols (when configured) and mixed digit+word + forms (e.g. 3 miljard) before calling alpha2digit, then normalizes currency + words to the plural forms in ``currency_symbol_to_word``. + """ + + def __init__(self, currency_symbol_to_word: dict[str, str] | None = None) -> None: + if alpha2digit is None: + raise ImportError( + "Dutch number normalization requires the text2num package. " + "Install it with: uv add text2num" + ) + self._alpha2digit = alpha2digit + self._currency_symbol_to_word = currency_symbol_to_word + self._currency_plural_fixes = _currency_plural_fix_patterns( + currency_symbol_to_word, + ) + + def __call__(self, text: str) -> str: + text = _normalize_currency_symbols(text, self._currency_symbol_to_word) + text = _normalize_mixed_numbers(text) + text = self._alpha2digit(text, "nl") + text = _apply_currency_plural_fixes(text, self._currency_plural_fixes) + return text diff --git a/normalization/languages/dutch/operators.py b/normalization/languages/dutch/operators.py new file mode 100644 index 0000000..2f7b556 --- /dev/null +++ b/normalization/languages/dutch/operators.py @@ -0,0 +1,116 @@ +import re + +from normalization.languages.base import ( + LanguageConfig, + LanguageOperators, +) +from normalization.languages.dutch.number_normalizer import DutchNumberNormalizer +from normalization.languages.dutch.sentence_replacements import ( + DUTCH_SENTENCE_REPLACEMENTS, +) +from normalization.languages.registry import register_language + +# Flemish apostrophe clitics (straight or typographic apostrophe). (?": "groter dan", + "<": "kleiner dan", + "°": "graden", + "°C": "graden celsius", + "°F": "graden fahrenheit", + "%": "procent", + }, + currency_symbol_to_word={ + "€": "euros", + "$": "dollars", + "£": "ponden", + "¢": "cent", + "¥": "yens", + }, + filler_words=[ + "ah", + "allee", + "alee", + "eh", + "ehm", + "hé", + "hè", + "he", + "hm", + "hmm", + "mm", + "mmm", + "mhm", + "nou", + "o", + "oke", + "okee", + "oké", + "uh", + ], + sentence_replacements=DUTCH_SENTENCE_REPLACEMENTS, +) + + +@register_language +class DutchOperators(LanguageOperators): + def __init__(self): + super().__init__(DUTCH_CONFIG) + self._number_normalizer = DutchNumberNormalizer( + DUTCH_CONFIG.currency_symbol_to_word, + ) + + def expand_written_numbers(self, text: str) -> str: + """Convert Dutch spelled-out numbers to digits (vijf en twintig → 25). + + Uses DutchNumberNormalizer, which normalizes currency symbols and mixed forms + (3 miljard → drie miljard), then text2num.alpha2digit. + """ + return self._number_normalizer(text) + + def expand_contractions(self, text: str) -> str: + def _temporal_sub(m: re.Match[str]) -> str: + return f"des{m.group(1)}{m.group(2).lower()}" + + text = _RE_TEMPORAL_S.sub(_temporal_sub, text) + text = _RE_CLITIC_S.sub("is", text) + + def _clitic_sub(m: re.Match[str]) -> str: + return _CLITIC_LETTER_TO_WORD[m.group(1).lower()] + + text = _RE_CLITIC_TRNKM.sub(_clitic_sub, text) + return text + + def get_word_replacements(self) -> dict[str, str]: + from normalization.languages.dutch.replacements import DUTCH_REPLACEMENTS + + return DUTCH_REPLACEMENTS diff --git a/normalization/languages/dutch/replacements.py b/normalization/languages/dutch/replacements.py new file mode 100644 index 0000000..af45bed --- /dev/null +++ b/normalization/languages/dutch/replacements.py @@ -0,0 +1,28 @@ +"""Single-token Flemish / colloquial → standard Dutch (canonical for WER).""" + +DUTCH_REPLACEMENTS: dict[str, str] = { + # Flemish dialect → standard Dutch + "ge": "je", + "da": "dat", + "ne": "een", + "efkes": "even", + "effe": "even", + "awel": "wel", + "den": "de", + "mijne": "mijn", + "gij": "jij", + "zij": "ze", + "zijne": "zijn", + # Bare clitics (apostrophe dropped by ASR) + "t": "het", + "s": "is", + "r": "er", + "k": "ik", + # Formal / informal pronoun conflation (Flemish customer service) + # ref uses formal u/uw; models transcribe je — normalise to je + "u": "je", + "uw": "je", + # Spelling variants → canonical + "okee": "oke", # oke is already in filler_words; okee must map to it + "euro": "euros", # collapse singular/plural +} diff --git a/normalization/languages/dutch/sentence_replacements.py b/normalization/languages/dutch/sentence_replacements.py new file mode 100644 index 0000000..fd5cab9 --- /dev/null +++ b/normalization/languages/dutch/sentence_replacements.py @@ -0,0 +1,9 @@ +"""Multi-word and phrase-level normalization for Dutch (incl. Flemish variants).""" + +DUTCH_SENTENCE_REPLACEMENTS: dict[str, str] = { + "fifty fifty": "5050", + "fiftyfifty": "5050", + "checks": "cheques", + "goeiemiddag": "goedemiddag", + "kollega": "collega", +} diff --git a/tests/e2e/files/gladia-3/nl.csv b/tests/e2e/files/gladia-3/nl.csv new file mode 100644 index 0000000..0f41e50 --- /dev/null +++ b/tests/e2e/files/gladia-3/nl.csv @@ -0,0 +1,25 @@ +input,expected +tien euro,10 euros +2 < 5,2 kleiner dan 5 +50°C,50 graden celsius +ca kost €50,ca kost 50 euros +"1.234,56",1234 komma 56 +dertien appels,13 appels +kollega zegt hallo,collega zegt hallo +ge weet da,je weet dat +ik zeg 't zo,ik zeg het zo +honderd euro,100 euros +vijf dollar,5 dollars +honderd euro's,100 euros +"3,14",3 komma 14 +192.168.1.1,192 punt 168 punt 1 punt 1 +test@example.com,test apenstaartje example punt com +www.example.com,w w w punt example punt com +x = 5,x gelijk aan 5 +Het woord [inaudible] is hier,het woord inaudible is hier +hallo eh daar,hallo daar +mein naam is Bob,mein naam is bob +twee duizend,2000 +'s ochtends vroeg,des ochtends vroeg +ping pong,ping pong +vijf en twintig euro,25 euros diff --git a/tests/unit/languages/dutch_number_normalizer_test.py b/tests/unit/languages/dutch_number_normalizer_test.py new file mode 100644 index 0000000..d039ed0 --- /dev/null +++ b/tests/unit/languages/dutch_number_normalizer_test.py @@ -0,0 +1,68 @@ +import pytest + +from normalization.languages.dutch.number_normalizer import DutchNumberNormalizer +from normalization.languages.dutch.operators import DUTCH_CONFIG + + +@pytest.fixture +def normalizer() -> DutchNumberNormalizer: + return DutchNumberNormalizer(DUTCH_CONFIG.currency_symbol_to_word) + + +@pytest.fixture +def normalizer_no_currency() -> DutchNumberNormalizer: + return DutchNumberNormalizer(None) + + +@pytest.mark.parametrize( + ("text", "expected"), + [ + ("vijf en twintig", "25"), + ("tweehonderd eenendertig", "231"), + ("drie miljard", "3000000000"), + ("3 miljard", "3000000000"), + ("2 miljoen", "2000000"), + ("2 MILJOEN", "2000000"), + ], +) +def test_alpha2digit_dutch_spelling_and_large_numbers( + normalizer: DutchNumberNormalizer, text: str, expected: str +): + assert normalizer(text) == expected + + +def test_multi_digit_then_miljoen_not_fully_merged(normalizer: DutchNumberNormalizer): + """Multi-digit + multiplier is left for alpha2digit; digit is not rewritten to a word.""" + assert normalizer("12 miljoen") == "12 1000000" + + +@pytest.mark.parametrize( + ("text", "expected"), + [ + ("tien euro", "10 euros"), + ("honderd euro's", "100 euros"), + ("€10", "10 euros"), + ("10 €", "10 euros"), + ("vijf dollar", "5 dollars"), + ("$3.50", "3.50 dollars"), + ("£5", "5 ponden"), + ("¥200", "200 yens"), + ], +) +def test_currency_symbols_and_plural_trailing_words( + normalizer: DutchNumberNormalizer, text: str, expected: str +): + assert normalizer(text) == expected + + +def test_without_currency_config_leaves_currency_symbol( + normalizer_no_currency: DutchNumberNormalizer, +): + assert normalizer_no_currency("vijf en twintig") == "25" + assert normalizer_no_currency("€10") == "€10" + assert normalizer_no_currency("3 miljard") == "3000000000" + + +def test_non_numeric_text_unchanged(normalizer: DutchNumberNormalizer): + text = "dit is gewone tekst" + assert normalizer(text) == text diff --git a/tests/unit/languages/dutch_operators_test.py b/tests/unit/languages/dutch_operators_test.py new file mode 100644 index 0000000..d639092 --- /dev/null +++ b/tests/unit/languages/dutch_operators_test.py @@ -0,0 +1,51 @@ +import pytest + +from normalization.languages.dutch.operators import DutchOperators +from normalization.languages.registry import get_language_registry + + +@pytest.fixture +def operators(): + return DutchOperators() + + +def test_dutch_is_registered(): + assert "nl" in get_language_registry() + + +def test_dutch_registry_produces_dutch_operators(): + instance = get_language_registry()["nl"]() + assert isinstance(instance, DutchOperators) + + +def test_expand_flemish_clitics(operators): + assert operators.expand_contractions("ik zeg 't zo") == "ik zeg het zo" + assert operators.expand_contractions("zeg ’t zo") == "zeg het zo" + assert operators.expand_contractions("'k kom morgen") == "ik kom morgen" + assert operators.expand_contractions("is 'r nog") == "is er nog" + assert operators.expand_contractions("'n beetje") == "een beetje" + assert operators.expand_contractions("zie je 'm") == "zie je hem" + assert operators.expand_contractions("dat 's goed") == "dat is goed" + + +def test_expand_clitic_s_not_possessive_after_word(operators): + assert operators.expand_contractions("Jan's auto") == "Jan's auto" + + +def test_expand_temporal_s_to_des(operators): + assert operators.expand_contractions("'s ochtends vroeg") == "des ochtends vroeg" + assert operators.expand_contractions("'S Avonds laat") == "des avonds laat" + + +def test_config_sentence_replacements(operators): + assert operators.config.sentence_replacements is not None + assert operators.config.sentence_replacements["kollega"] == "collega" + + +def test_word_replacements(operators): + assert operators.get_word_replacements()["ge"] == "je" + assert operators.get_word_replacements()["da"] == "dat" + assert operators.get_word_replacements()["u"] == "je" + assert operators.get_word_replacements()["uw"] == "je" + assert operators.get_word_replacements()["okee"] == "oke" + assert operators.get_word_replacements()["euro"] == "euros" diff --git a/tests/unit/steps/text/conftest.py b/tests/unit/steps/text/conftest.py index 81c4f15..e515c55 100644 --- a/tests/unit/steps/text/conftest.py +++ b/tests/unit/steps/text/conftest.py @@ -1,6 +1,7 @@ import pytest from normalization.languages.base import LanguageOperators +from normalization.languages.dutch import DutchOperators from normalization.languages.english import EnglishOperators from normalization.languages.french import FrenchOperators from normalization.steps import get_step_registry @@ -21,6 +22,11 @@ def french_operators(): return FrenchOperators() +@pytest.fixture +def dutch_operators(): + return DutchOperators() + + def assert_text_step_registered(step_cls): """Verify a text step is properly registered under its name.""" registry = get_step_registry()