Blame view
egs/librispeech/s5/local/lm/normalize_text.sh
1.51 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
#!/bin/bash

# Copyright 2014 Vassil Panayotov
# Apache 2.0

# Performs text normalization for subsequent language model training.
#
# Usage: normalize_text.sh <input-book-dirs> <output-root>
#   <input-book-dirs>  text file listing book directories, separated by
#                      whitespace/newlines. For a directory <dir> whose last
#                      path component is <id>, the text is read from
#                      <dir>/<id>.txt.
#   <output-root>      root of the output tree; the normalized text for each
#                      book is written to <output-root>/<id>/<id>.txt.
#
# Must be run from a directory that contains path.sh and local/lm/python/,
# because both are referenced through relative paths.
# Requires Festival's nsw_expand to be in PATH.
# NOTE(review): $PYTHON is not set in this script; presumably path.sh
# defines it - confirm.

echo "$@"

. ./path.sh || exit 1

# Without pipefail, only the exit status of the LAST pipeline stage is seen
# by '|| exit 1' below, so a crash in any earlier stage would be silently
# counted as a successfully processed text.
set -o pipefail

if [[ $# -ne 2 ]]; then
  echo "Usage: $0 <input-book-dirs> <output-root>"
  exit 1
fi

in_list=$1
out_root=$2

[[ -f "$in_list" ]] || { echo "The input file '$in_list' does not exists!"; exit 1; }

command -v nsw_expand 1>/dev/null 2>&1 || {
  echo ""
  echo "The Festival's NSW text normalization package is not found in PATH"
  echo "You can try to install it by running:"
  echo " local/lm/install_festival.sh [--apply-gcc-patch false]"
  echo "Note however, that this script should only be considered as an example,"
  echo "so if you run into installation problems, it's up to you to resolve them"
  exit 1
}

mkdir -p "$out_root" || exit 1

# Load the list entries into an array. Entries are split on whitespace, so the
# accepted list format is unchanged, but unlike an unquoted $(cat file) the
# entries are not subject to glob expansion.
# 'read -d ""' consumes the whole file and returns non-zero at end-of-file
# even though the array has been filled, hence the '|| true'.
book_dirs=()
read -r -d '' -a book_dirs < "$in_list" || true

processed=0

for b in "${book_dirs[@]}"; do
  id=$(basename "$b")
  echo "Start processing $id at $(date '+%T %F')"

  in_file=$b/$id.txt
  # A missing input text is not fatal: warn and move on to the next book.
  [[ -f "$in_file" ]] || { echo "WARNING: $in_file does not exists"; continue; }

  out_file=$out_root/$id/$id.txt
  mkdir -p "$out_root/$id" || exit 1

  # pre-filter -> pre-process -> NSW expansion -> post-process.
  # $PYTHON is intentionally left unquoted so that it may hold an interpreter
  # plus options; if it is empty the scripts are executed directly.
  $PYTHON local/lm/python/pre_filter.py "$in_file" /dev/stdout \
    | $PYTHON local/lm/python/text_pre_process.py /dev/stdin /dev/stdout \
    | nsw_expand -format opl /dev/stdin \
    | $PYTHON local/lm/python/text_post_process.py /dev/stdin "$out_file" /dev/null \
    || { echo "ERROR: text normalization pipeline failed for $id" >&2; exit 1; }

  processed=$((processed + 1))
  echo "Processing of $id has finished at $(date '+%T %F') [$processed texts ready so far]"
done

echo "$processed texts processed OK and stored under '$out_root'"

exit 0