rm_data_prep.sh 4.3 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112


#!/bin/bash
#
# Copyright 2010-2012 Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

# To be run from one directory above this script.

# The input is the 3 CDs from the LDC distribution of Resource Management.
# The script's argument is a directory which has three subdirectories:
# rm1_audio1  rm1_audio2  rm2_audio

# Note: when creating your own data preparation scripts, it's a good idea
# to make sure that the speaker id (if present) is a prefix of the utterance
# id, that the output scp file is sorted on utterance id, and that the 
# transcription file is exactly the same length as the scp file and is also
# sorted on utterance id (missing transcriptions should be removed from the
# scp file using e.g. scripts/filter_scp.pl)

if [ $# != 1 ]; then
  echo "Usage: ../../local/RM_data_prep.sh /path/to/RM"
  exit 1; 
fi 

export LC_ALL=C

RMROOT=$1

tmpdir=data/local/tmp
mkdir -p $tmpdir
. ./path.sh || exit 1; # for KALDI_ROOT

if [ ! -d $RMROOT/rm1_audio1 -o ! -d $RMROOT/rm1_audio2 ]; then
   echo "Error: rm_data_prep.sh requires a directory argument (an absolute pathname) that contains rm1_audio1 and rm1_audio2"
   exit 1; 
fi  

if [ ! -d $RMROOT/rm2_audio ]; then
   echo "**Warning: $RMROOT/rm2_audio does not exist; won't create spk2gender file correctly***"
   sleep 1
fi  

(
    find $RMROOT/rm1_audio1/rm1/ind_trn -iname '*.sph';
    find $RMROOT/rm1_audio2/2_4_2/rm1/ind/dev_aug -iname '*.sph';
) | perl -ane ' m:/sa\d.sph:i || m:/sb\d\d.sph:i || print; '  > $tmpdir/train_sph.flist


dir=data/train
mkdir -p $dir

# make_trans.pl also creates the utterance id's and the kaldi-format scp file.
local/make_trans.pl trn $tmpdir/train_sph.flist $RMROOT/rm1_audio1/rm1/doc/al_sents.snr >(sort -k1 >$dir/text) \
  >(sort -k1 >$dir/sph.scp)
sleep 0.25 # At one point I had the next line failing because $dir/sph.scp appeared not
           # to exist.  Adding this sleep statement appeared to fix the problem.


sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
[ ! -f $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1;

awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' <$dir/sph.scp > $dir/wav.scp
rm $dir/sph.scp

cat $dir/wav.scp | perl -ane 'm/^((\w+)\w_\w+_\w+) / || die; print "$1 $2\n"' > $dir/utt2spk
utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt


for ntest in 1_mar87 2_oct87 4_feb89 5_oct89 6_feb91 7_sep92; do
  n=`echo $ntest | cut -d_ -f 1` # e.g. n = 1, 2, 4, 5..
  test=`echo $ntest | cut -d_ -f 2` # e.g. test=mar87, oct87...
  dir=data/test_${test}
  mkdir -p $dir
  root=$RMROOT/rm1_audio2/2_4_2
  for x in `grep -v ';' $root/rm1/doc/tests/$ntest/${n}_indtst.ndx`; do
    echo "$root/$x ";
  done | sort > $dir/sph.flist

  local/make_trans.pl ${test} $dir/sph.flist $RMROOT/rm1_audio1/rm1/doc/al_sents.snr \
     >(sort -k1 >$dir/text) >(sort -k1 >$dir/sph.scp)
  sleep 0.25 # At one point I had the next line failing because $dir/sph.scp appeared not
             # to exist.  Adding this sleep statement appeared to fix the problem.
  awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' <$dir/sph.scp >$dir/wav.scp
  rm $dir/sph.flist $dir/sph.scp

  cat $dir/wav.scp | perl -ane 'm/^((\w+)\w_\w+_\w+) / || die; print "$1 $2\n"' > $dir/utt2spk
  utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
done

cat $RMROOT/rm1_audio2/2_5_1/rm1/doc/al_spkrs.txt \
    $RMROOT/rm2_audio/3-1.2/rm2/doc/al_spkrs.txt | \
    perl -ane 'tr/A-Z/a-z/;print;' | grep -v ';' | \
    awk '{print $1, $2}' | sort | uniq > $tmpdir/spk2gender || exit 1;

for t in train test_mar87 test_oct87 test_feb89 test_oct89 test_feb91 test_sep92; do
  utils/filter_scp.pl data/$t/spk2utt $tmpdir/spk2gender >data/$t/spk2gender
done

local/make_rm_lm.pl $RMROOT/rm1_audio1/rm1/doc/wp_gram.txt  > $tmpdir/G.txt || exit 1;

mkdir -p data/local/dict

# Getting lexicon
local/make_rm_dict.pl  $RMROOT/rm1_audio2/2_4_2/score/src/rdev/pcdsril.txt \
   > data/local/dict/lexicon.txt || exit 1;

# Get phone lists...
grep -v -w sil data/local/dict/lexicon.txt | \
  awk '{for(n=2;n<=NF;n++) { p[$n]=1; }} END{for(x in p) {print x}}' | sort > data/local/dict/nonsilence_phones.txt
echo sil > data/local/dict/silence_phones.txt
echo sil > data/local/dict/optional_silence.txt
touch data/local/dict/extra_questions.txt # no extra questions, as we have no stress or tone markers.

echo RM_data_prep succeeded.