apply_g2p.sh 3.03 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89


#!/bin/bash
# Copyright 2014  Johns Hopkins University (Author: Yenda Trmal)
# Copyright 2016  Xiaohui Zhang
# Apache 2.0

# Begin configuration section.
# Defaults for the options this script accepts; the usage text points here
# for the ones it does not list itself.
model=            # reassigned further down to <g2p-model-dir>/g2p.model.final
nj=10             # number of parallel jobs the word list is split across
cmd=run.pl        # job launcher (see --cmd in the usage text)
var_mass=0.9      # generate as many variants as needed to cover 90 % of the prob mass
var_counts=3      # generate up to N variants
encoding='utf-8'
stage=0           # the G2P jobs are run only when stage <= 0
# End configuration section.

echo "$0 $*"  # log the command line this script was invoked with

# Source the environment setup if the current directory provides one.
if [ -f ./path.sh ]; then
  . ./path.sh
fi
# parse_options.sh has no slash in its name, so it is looked up via PATH.
# NOTE(review): presumably path.sh is what puts it on PATH -- confirm.
. parse_options.sh || exit 1

# From here on: unset variables and failing commands abort the script.
set -eu

# Exactly three positional arguments are required; anything else prints the
# usage text and exits with status 1.
if [ $# -ne 3 ]; then
  cat <<EOF
Usage: $0 [options] <word-list> <g2p-model-dir> <output-dir>
... where <word-list> is a list of words whose pronunciation is to be generated
          <g2p-model-dir> is a directory used as a target during training of G2P
          <output-dir> is the directory where the output lexicon should be stored
e.g.: $0 oov_words exp/g2p exp/g2p/oov_lex

main options (for others, see top of script file)
  --nj <int>                                    # How many tasks should be spawn (to speedup things)
  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
EOF
  exit 1
fi

# Positional arguments, in the order given by the usage text.
wordlist=$1
modeldir=$2
output=$3

# Every path expansion below is quoted so that directories or files whose
# names contain whitespace or glob characters are passed through unchanged.
mkdir -p -- "$output/log"

# NOTE(review): this unconditionally replaces whatever value the `model`
# variable from the configuration section held.
model=$modeldir/g2p.model.final
if [ ! -f "$model" ]; then
  echo "File $model not found in the directory $modeldir."
  exit 1
fi
#[ ! -x $wordlist ] && echo "File $wordlist not found!" && exit 1

# Keep a copy of the word list inside the output directory; the later steps
# read this copy rather than the original argument.
cp -- "$wordlist" "$output/wordlist.txt"

# Only the presence of g2p.py on PATH matters here: the job command invokes
# it by name, so the resolved location does not need to be stored.
if ! command -v g2p.py >/dev/null 2>&1; then
  echo "The Sequitur was not found !"
  # KALDI_ROOT may be unset. Under `set -u` a bare $KALDI_ROOT would abort the
  # script in the middle of this hint, so fall back to the literal name.
  echo "Go to ${KALDI_ROOT:-\$KALDI_ROOT}/tools and execute extras/install_sequitur.sh"
  exit 1
fi

echo "Applying the G2P model to wordlist $wordlist"

if [ "$stage" -le 0 ]; then
  # `split -n l/K/N` prints the K-th of N line-aligned chunks of its input to
  # stdout; that chunk is piped into g2p.py and the result is written to a
  # per-job shard file.
  # The escaped `\|` and `\>` reach $cmd as literal arguments instead of being
  # interpreted by this shell. JOBS is presumably replaced by $cmd with the
  # job index (1..$nj) -- confirm against the launcher in use.
  # $cmd is left unquoted on purpose: per the usage text it may carry its own
  # options (e.g. "utils/queue.pl <queue opts>").
  $cmd "JOBS=1:$nj" "$output/log/apply.JOBS.log" \
    split -n "l/JOBS/$nj" "$output/wordlist.txt" \| \
    g2p.py -V "$var_mass" --variants-number "$var_counts" --encoding "$encoding" \
      --model "$modeldir/g2p.model.final" --apply - \
    \> "$output/output.JOBS"
fi

# Merge exactly the shards that belong to this $nj, in job order (which is
# also the order of the word list). A glob such as output.* would also pick
# up leftovers of an earlier run with a larger --nj, or any other file named
# output.<something>, and would sort output.10 before output.2.
# A missing shard makes `cat` fail, which aborts the script under `set -e`
# instead of silently producing a partial result.
shards=()
for ((job = 1; job <= nj; job++)); do
  shards+=("$output/output.$job")
done
cat -- "${shards[@]}" > "$output/output"

# The sequitur output is not readily usable as a lexicon: it adds one more
# column with the ordering of the pronunciation variants. Keeping only the
# tab-separated columns 1, 3 and 4 converts it into the lexicon form.
output_lex=$output/lexicon.lex
cut -f 1,3,4 "$output/output" > "$output_lex"

# Some words might have been removed or skipped during the process, so
# compare the number of distinct words (first column) in the lexicon and in
# the word list, and warn the user if they differ.
nlex=$(cut -f 1 "$output_lex" | sort -u | wc -l)
nwlist=$(cut -f 1 "$output/wordlist.txt" | sort -u | wc -l)
if [ "$nlex" -ne "$nwlist" ]; then
  echo "WARNING: Unable to generate pronunciation for all words. "
  echo "WARNING:   Wordlist: $nwlist words"
  echo "WARNING:   Lexicon : $nlex words"
  echo "WARNING:Diff example: "
  # diff exits non-zero when its inputs differ; `|| true` keeps `set -e` from
  # turning this purely informational listing into a script failure.
  diff <(cut -f 1 "$output_lex" | sort -u) \
       <(cut -f 1 "$output/wordlist.txt" | sort -u) || true
fi
exit 0