Blame view
egs/wsj/s5/utils/lang/bpe/bidi.py
2.07 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
#!/usr/bin/env python3 # Copyright 2018 Chun-Chieh Chang # This script is largely written by Stephen Rawls # and uses the python package https://pypi.org/project/PyICU_BiDi/ # The code leaves right to left text alone and reverses left to right text. import icu_bidi import io import sys import unicodedata # R=strong right-to-left; AL=strong arabic right-to-left rtl_set = set(chr(i) for i in range(sys.maxunicode) if unicodedata.bidirectional(chr(i)) in ['R','AL']) def determine_text_direction(text): # Easy case first for char in text: if char in rtl_set: return icu_bidi.UBiDiLevel.UBIDI_RTL # If we made it here we did not encounter any strongly rtl char return icu_bidi.UBiDiLevel.UBIDI_LTR def utf8_visual_to_logical(text): text_dir = determine_text_direction(text) bidi = icu_bidi.Bidi() bidi.inverse = True bidi.reordering_mode = icu_bidi.UBiDiReorderingMode.UBIDI_REORDER_INVERSE_LIKE_DIRECT bidi.reordering_options = icu_bidi.UBiDiReorderingOption.UBIDI_OPTION_DEFAULT # icu_bidi.UBiDiReorderingOption.UBIDI_OPTION_INSERT_MARKS bidi.set_para(text, text_dir, None) res = bidi.get_reordered(0 | icu_bidi.UBidiWriteReorderedOpt.UBIDI_DO_MIRRORING | icu_bidi.UBidiWriteReorderedOpt.UBIDI_KEEP_BASE_COMBINING) return res def utf8_logical_to_visual(text): text_dir = determine_text_direction(text) bidi = icu_bidi.Bidi() bidi.reordering_mode = icu_bidi.UBiDiReorderingMode.UBIDI_REORDER_DEFAULT bidi.reordering_options = icu_bidi.UBiDiReorderingOption.UBIDI_OPTION_DEFAULT #icu_bidi.UBiDiReorderingOption.UBIDI_OPTION_INSERT_MARKS bidi.set_para(text, text_dir, None) res = bidi.get_reordered(0 | icu_bidi.UBidiWriteReorderedOpt.UBIDI_DO_MIRRORING | icu_bidi.UBidiWriteReorderedOpt.UBIDI_KEEP_BASE_COMBINING) return res ##main## sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf8") sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8") for line in sys.stdin: line = line.strip() line = utf8_logical_to_visual(line)[::-1] sys.stdout.write(line + ' ') |