egs/librispeech/s5/local/g2p/train_g2p.sh
#!/bin/bash

# Copyright 2014  Vassil Panayotov
# Apache 2.0

# Trains Sequitur G2P models on CMUdict

# can be used to skip some of the initial steps
stage=1

. utils/parse_options.sh || exit 1
# path.sh is expected to make the Sequitur G2P variables
# ($sequitur, $sequitur_path, $PYTHON) available
. ./path.sh || exit 1

if [ $# -ne "2" ]; then
  echo "Usage: $0 <cmudict-download-dir> <g2p-dir>"
  echo "e.g.: $0 data/local/dict/cmudict data/local/g2p_model"
  exit 1
fi

cmudict_dir=$1
g2p_dir=$2

mkdir -p $cmudict_dir
mkdir -p $g2p_dir

cmudict_plain=$g2p_dir/cmudict.0.7a.plain
cmudict_clean=$g2p_dir/cmudict.0.7a.clean

if [ $stage -le 1 ]; then
  echo "Downloading and preparing CMUdict"
  if [ ! -s $cmudict_dir/cmudict.0.7a ]; then
    svn co -r 12440 https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict $cmudict_dir || exit 1;
  else
    echo "CMUdict copy found in $cmudict_dir - skipping download!"
  fi
fi

if [ $stage -le 2 ]; then
  echo "Removing the pronunciation variant markers ..."
  grep -v ';;;' $cmudict_dir/cmudict.0.7a | \
    perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \
    > $cmudict_plain || exit 1;
  echo "Removing special pronunciations (not helpful for G2P modelling) ..."
  egrep -v '^[^A-Z]' $cmudict_plain > $cmudict_clean
fi

model_1=$g2p_dir/model-1
if [ $stage -le 3 ]; then
  echo "Training first-order G2P model (log in '$g2p_dir/model-1.log') ..."
  PYTHONPATH=$sequitur_path:$PYTHONPATH $PYTHON $sequitur \
    --train $cmudict_clean --devel 5% --write-model $model_1 \
    >$g2p_dir/model-1.log 2>&1 || exit 1
fi

model_2=$g2p_dir/model-2
if [ $stage -le 4 ]; then
  echo "Training second-order G2P model (log in '$g2p_dir/model-2.log') ..."
  PYTHONPATH=$sequitur_path:$PYTHONPATH $PYTHON $sequitur \
    --model $model_1 --ramp-up --train $cmudict_clean \
    --devel 5% --write-model $model_2 \
    >$g2p_dir/model-2.log 2>&1 || exit 1
fi

model_3=$g2p_dir/model-3
if [ $stage -le 5 ]; then
  echo "Training third-order G2P model (log in '$g2p_dir/model-3.log') ..."
  PYTHONPATH=$sequitur_path:$PYTHONPATH $PYTHON $sequitur \
    --model $model_2 --ramp-up --train $cmudict_clean \
    --devel 5% --write-model $model_3 \
    >$g2p_dir/model-3.log 2>&1 || exit 1
fi

model_4=$g2p_dir/model-4
if [ $stage -le 6 ]; then
  echo "Training fourth-order G2P model (log in '$g2p_dir/model-4.log') ..."
  PYTHONPATH=$sequitur_path:$PYTHONPATH $PYTHON $sequitur \
    --model $model_3 --ramp-up --train $cmudict_clean \
    --devel 5% --write-model $model_4 \
    >$g2p_dir/model-4.log 2>&1 || exit 1
fi

model_5=$g2p_dir/model-5
if [ $stage -le 7 ]; then
  echo "Training fifth-order G2P model (log in '$g2p_dir/model-5.log') ..."
  PYTHONPATH=$sequitur_path:$PYTHONPATH $PYTHON $sequitur \
    --model $model_4 --ramp-up --train $cmudict_clean \
    --devel 5% --write-model $model_5 \
    >$g2p_dir/model-5.log 2>&1 || exit 1
fi

echo "G2P training finished OK!"

exit 0
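For reference, a minimal invocation sketch. The training call uses the example paths from the script's own usage message; the second command shows how the resulting fifth-order model could be applied with Sequitur's --apply mode to a plain word list (words.txt and the output path are assumptions for illustration, one word per line; this script itself does not perform the apply step):

  # Train the five G2P models:
  local/g2p/train_g2p.sh data/local/dict/cmudict data/local/g2p_model

  # Generate pronunciations for a word list with the final model
  # (words.txt and the output file name are hypothetical):
  PYTHONPATH=$sequitur_path:$PYTHONPATH $PYTHON $sequitur \
    --model data/local/g2p_model/model-5 --apply words.txt \
    > data/local/g2p_model/lexicon_autogen.txt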