gp_data_prep.sh 4.55 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124


#!/bin/bash -u

# Copyright 2012  Arnab Ghoshal

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Abort on the first failing command and on any reference to an unset
# variable. nounset is requested here as well as on the shebang line because
# interpreter options given on the shebang are ignored when the script is
# started as `bash <script>`.
set -o errexit -o nounset

# Report an error and stop.
# Arguments: all arguments form the message; they are printed with `echo -e`,
#            so backslash escapes such as \n and \t are rendered.
# Outputs:   the message on stderr (nothing on stdout).
# Exits the current shell with status 1.
error_exit() {
  {
    echo -e "$@"
  } 1>&2
  exit 1
}

# Resolve the DIR part of a "--option=DIR" argument to an absolute path.
# Arguments: $1 - option string of the form NAME=DIR; DIR is everything after
#                 the first '=' (it may itself contain '=' or whitespace).
# Outputs:   the absolute path of DIR on stdout.
# Exits (through error_exit) with status 1 when DIR is not a directory or
# cannot be entered.
read_dirname() {
  local dir_name=""
  local abs_path
  # Keep the text after the first '='; stays empty when there is no '='.
  case "$1" in
    *=*) dir_name=${1#*=} ;;
  esac
  [ -d "$dir_name" ] || error_exit "Argument '$dir_name' not a directory";
  # CDPATH is cleared so that cd neither looks for a relative name in other
  # places nor prints the directory it selected. The assignment is kept apart
  # from `local` so that a failing cd is not masked.
  abs_path=$(CDPATH= cd -- "$dir_name" 2>/dev/null && pwd) \
    || error_exit "Cannot change to directory '$dir_name'";
  printf '%s\n' "$abs_path"
}

PROG=$(basename -- "$0")
# NOTE: $usage is deliberately expanded unquoted wherever it is printed. Word
# splitting folds the literal line breaks below into single spaces, and
# `echo -e` then renders the \n and \t escapes as the real layout.
usage="Usage: $PROG <arguments>\n
Prepare train, dev, eval file lists for a language.\n\n
Required arguments:\n
  --config-dir=DIR\tDirectory containing the necessary config files\n
  --corpus-dir=DIR\tDirectory for the GlobalPhone corpus\n
  --lm-dir=DIR\t\tDirectory containing language models\n
  --work-dir=DIR\t\tWorking directory\n
";

# Show the help text whenever --help is present, even when it is given
# without the four required options.
for arg; do
  if [ "$arg" = "--help" ]; then
    echo -e $usage
    exit 0
  fi
done

if [ $# -lt 4 ]; then
  error_exit $usage;
fi

# Start from empty values so that a required option that is never supplied is
# detected after the loop instead of being inherited from the environment.
CONFDIR=""
GPDIR=""
LMDIR=""
WDIR=""

# Every directory option is checked and made absolute by read_dirname.
while [ $# -gt 0 ]; do
  case "$1" in
    --config-dir=*)
      CONFDIR=$(read_dirname "$1"); shift ;;
    --corpus-dir=*)
      GPDIR=$(read_dirname "$1"); shift ;;
    --lm-dir=*)
      LMDIR=$(read_dirname "$1"); shift ;;
    --work-dir=*)
      WDIR=$(read_dirname "$1"); shift ;;
    *)
      echo "Unknown argument: $1, exiting" >&2
      echo -e $usage >&2
      exit 1 ;;
  esac
done

# Four arguments do not imply four distinct options (one may be repeated), so
# each required value is checked explicitly.
[ -n "$CONFDIR" ] || error_exit "$PROG: --config-dir=DIR is required.";
[ -n "$GPDIR" ] || error_exit "$PROG: --corpus-dir=DIR is required.";
[ -n "$LMDIR" ] || error_exit "$PROG: --lm-dir=DIR is required.";
[ -n "$WDIR" ] || error_exit "$PROG: --work-dir=DIR is required.";

# (1) check if the config files are in place:
# Paths are quoted so that directories containing whitespace work, and an
# empty/unset directory variable aborts instead of turning `cd` into a jump
# to $HOME.
cd "${CONFDIR:?config directory is not set}" \
  || error_exit "$PROG: Cannot change to config directory '$CONFDIR'.";
[ -f dev_spk.list ] || error_exit "$PROG: Dev-set speaker list not found.";
[ -f eval_spk.list ] || error_exit "$PROG: Eval-set speaker list not found.";
[ -f lang_codes.txt ] || error_exit "$PROG: Mapping for language name to 2-letter code not found.";

# Everything below runs from the working directory and uses paths relative
# to it.
cd "${WDIR:?working directory is not set}" \
  || error_exit "$PROG: Cannot change to working directory '$WDIR'.";
# path.sh is optional; it is sourced only when present in the working directory.
[ -f path.sh ] && . ./path.sh  # Sets the PATH to contain necessary executables

# (2) get the various file lists (for audio, transcription, etc.) for the
# specified language.
# The per-language jobs run in parallel since this does audio conversion (to
# figure out which files cannot be processed) and takes some time to run.
# Each job's stderr is captured in data/<LCODE>/prep_flists.log.
flist_pids=()
for LCODE in GE PO SP SW; do
  mkdir -p "data/$LCODE"
  gp_prep_flists.sh --corpus-dir="$GPDIR" --dev-spk="$CONFDIR/dev_spk.list" \
    --eval-spk="$CONFDIR/eval_spk.list" --lang-map="$CONFDIR/lang_codes.txt" \
    --work-dir=data "$LCODE" 2>"data/$LCODE/prep_flists.log" &
  flist_pids+=("$!")
done

# A bare `wait` throws away the jobs' exit statuses and errexit does not apply
# to background jobs, so every job is waited for by PID and failures are
# collected; all jobs are allowed to finish before the script gives up.
failed_langs=""
job=0
for LCODE in GE PO SP SW; do
  wait "${flist_pids[$job]}" || failed_langs="$failed_langs $LCODE"
  job=$((job + 1))
done
[ -z "$failed_langs" ] \
  || error_exit "$PROG: gp_prep_flists.sh failed for:$failed_langs (see data/<LCODE>/prep_flists.log).";

# (3) Normalize the dictionary and transcripts.
# For every language this writes, under data/<LCODE>/local: the lexicon
# without and with the silence/unknown entries, the disambiguated lexicon, the
# number of disambiguation symbols, the phone and word symbol tables, and the
# normalized transcripts (*.trans2).
for LCODE in GE PO SP SW; do
  lang_dir="data/$LCODE/local"

  # Second field of the lang_codes.txt line(s) containing the language code;
  # it is used as the prefix of the dictionary file name.
  full_name=$(awk -v code="$LCODE" '$0 ~ code {print $2}' "$CONFDIR/lang_codes.txt");
  [ -n "$full_name" ] \
    || error_exit "$PROG: No entry for '$LCODE' found in $CONFDIR/lang_codes.txt.";

  "gp_norm_dict_${LCODE}.pl" -i "$GPDIR/Dictionaries/${LCODE}/${full_name}-GPDict.txt" \
    | sort -u > "$lang_dir/lexicon_nosil_${LCODE}.txt"
  # Full lexicon = silence and unknown-word entries followed by the
  # normalized dictionary.
  {
    printf '!SIL\tSIL\n<UNK>\tSPN\n'
    cat "$lang_dir/lexicon_nosil_${LCODE}.txt"
  } > "$lang_dir/lexicon_${LCODE}.txt";

  # add disambig symbols to the lexicon (the script's stdout is taken as the
  # number of symbols it added):
  ndisambig=$(add_lex_disambig.pl "$lang_dir/lexicon_${LCODE}.txt" "$lang_dir/lexicon_disambig_${LCODE}.txt")
  ndisambig=$((ndisambig + 1));  # add one disambig symbol for silence
  echo "$ndisambig" > "$lang_dir/lex_ndisambig"

  # Get the list of phones and map them to integers (adding silence and spoken
  # noise to the list).
  # NOTE(review): the sed pattern removes everything from the first '_' to the
  # end of the line, not just a per-phone suffix - confirm this is intended
  # for the pronunciation format produced by gp_norm_dict_*.pl.
  cut -f2 "$lang_dir/lexicon_nosil_${LCODE}.txt" | sed -e "s?_.*??g" \
    | tr ' ' '\n' | sort -u \
    | awk 'BEGIN{ print "<eps> 0"; print "SIL 1"; print "SPN 2"; N=3; } 
           { printf("%s %d\n", $1, N++); }' > "$lang_dir/phones.txt"
  # If using word-boundary markers on phones, use this in the awk command above
           # { printf("%s_WB %d\n", $1, N++); }
  # If using position markers on phones, use these in the awk command above
           # { printf("%s_B %d\n", $1, N++); }
           # { printf("%s_E %d\n", $1, N++); }
           # { printf("%s_S %d\n", $1, N++); }

  # Get the list of words: <eps> is 0, the sorted words are numbered from 1,
  # and #0 takes the next free number.
  cut -f1 "$lang_dir/lexicon_${LCODE}.txt" | sort -u \
    | awk 'BEGIN{print "<eps> 0";} {printf("%s %d\n", $1, NR);} 
           END{printf("#0 %d\n", NR+1);}' > "$lang_dir/words.txt"

  # Normalize the transcripts of each data split.
  for x in train dev eval; do
    "gp_norm_trans_${LCODE}.pl" -i "$lang_dir/${x}_${LCODE}.trans" \
      > "$lang_dir/${x}_${LCODE}.trans2";
  done

done

# (4) Normalize the LMs - this is very Edinburgh-specific since we have some
# LMs that came with the GlobalPhone corpus.
# The directory arguments are quoted so that paths containing whitespace are
# passed as single words.
gp_prep_lms_edin.sh --lm-dir="$LMDIR" --work-dir="$WDIR"

echo "Finished data preparation."