generate_uniformly_segmented_data_dir.sh 2.66 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83


#!/bin/bash

# Copyright Vijayaditya Peddinti, 2016.
# Apache 2.0.
# This script generates a uniformly segmented data dir. If the directory
# already has a segments file (e.g. data/dev_aspire), we first create a
# directory without segments and then uniformly segment it.
# It also extracts hires MFCC features.

# Stop at the first failing command and echo every command as it is executed.
set -ex

# Default values of the command-line options. They must be assigned before
# utils/parse_options.sh is sourced below, which is presumably what lets
# --stage, --window, ... override them -- TODO confirm against that script.
window=10    # size of the uniform segment
overlap=5    # size of the overlap
num_jobs=30  # handed to feature extraction as --nj
stage=1      # first stage to execute

# cmd.sh is mandatory (presumably it defines $train_cmd, used for feature
# extraction); path.sh is only sourced when it exists.
source ./cmd.sh
if [ -f ./path.sh ]; then
  source ./path.sh
fi
source utils/parse_options.sh || exit 1

# Exactly two positional arguments are required. Both are data set *names*:
# the script itself prepends "data/" and appends "_hires" when building paths,
# so the example below must not pass paths such as data/train.
if [ $# -ne 2 ]; then
  echo "Usage: $0 [options] <data-set> <out-data-set>"
  echo " Options:"
  echo "    --stage (1|2|3)   # start this script from part-way through."
  echo "    --num-jobs <n>    # number of parallel feature extraction jobs."
  echo "    --window <dur>    # size of the uniform segment."
  echo "    --overlap <dur>   # size of the overlap."
  echo "e.g.:"
  echo "$0 dev_aspire dev_aspire_uniformsegmented"
  exit 1;
fi

# Name of the input data set (read from data/<data_set>).
data_set=$1
# Name of the output data set (written to data/<segmented_data_set>_hires).
segmented_data_set=$2

# Stage 1: create data/<data_set>_hires, the directory the later stages
# extract features into and segment.
if [ "$data_set" == "dev_aspire" ]; then
  # dev_aspire is first converted to whole recordings; the result goes to
  # data/dev_aspire_whole_hires, which is what data/${data_set}_hires refers
  # to once data_set has been renamed below.
  if [ "$stage" -le 1 ]; then
    echo "$0: Creating the data dir with whole recordings without segmentation"
    # create a whole directory without the segments
    unseg_dir="data/${data_set}_whole_hires"
    src_dir="data/${data_set}"
    utils/data/convert_data_dir_to_whole.sh "$src_dir" "$unseg_dir"

    echo "$0: Creating the $unseg_dir/reco2file_and_channel file"
    # Each recording id maps to itself as the file name, on channel "A".
    # wav.scp is read via redirection rather than "cat | awk": without
    # pipefail a missing wav.scp would otherwise go unnoticed and leave an
    # empty reco2file_and_channel behind.
    awk '{print $1, $1, "A";}' < "$unseg_dir/wav.scp" \
      > "$unseg_dir/reco2file_and_channel"
  fi
  data_set=${data_set}_whole
elif [ "$stage" -le 1 ] || [ ! -d "data/${data_set}_hires" ]; then
  # Honour --stage here as the dev_aspire branch does: when resuming at a
  # later stage the existing hires directory must not be copied over again.
  # The copy is still made when the directory is missing, so invocations that
  # relied on the previous unconditional copy keep working.
  utils/copy_data_dir.sh "data/$data_set" "data/${data_set}_hires"
fi

# Stage 2: extract MFCC features with conf/mfcc_hires.conf for
# data/<data_set>_hires, compute CMVN stats, then fix and validate the
# directory. All expansions are quoted so that unusual data set names are
# passed through as a single argument.
if [ "$stage" -le 2 ]; then
  echo "$0: Extracting features"
  hires_dir="data/${data_set}_hires"

  steps/make_mfcc.sh --cmd "$train_cmd" --nj "$num_jobs" \
    --mfcc-config conf/mfcc_hires.conf "$hires_dir"

  steps/compute_cmvn_stats.sh "$hires_dir"

  utils/fix_data_dir.sh "$hires_dir"
  # --no-text: the validation is run without requiring a transcript file.
  utils/validate_data_dir.sh --no-text "$hires_dir"
fi

# Stage 3: split data/<data_set>_hires into uniform, overlapping sub-segments
# and write the result to data/<segmented_data_set>_hires (any existing
# directory of that name is replaced).
if [ "$stage" -le 3 ]; then
  echo "$0: Generating uniform segments with length $window and overlap $overlap."
  src_hires="data/${data_set}_hires"
  seg_hires="data/${segmented_data_set}_hires"

  # The output directory is deleted below; refuse to run if that would wipe
  # out the input directory.
  if [ "$src_hires" = "$seg_hires" ]; then
    echo "$0: output data set must differ from input data set ($src_hires)" >&2
    exit 1
  fi

  if [ -d "$seg_hires" ]; then
    rm -r -- "$seg_hires"
  fi

  if [ ! -f "$src_hires/segments" ]; then
    # Write to a temporary name first: redirecting straight to "segments"
    # would leave an empty file behind if the command failed, and a rerun
    # would then skip this step and use the empty file.
    utils/data/get_segments_for_data.sh "$src_hires" > "$src_hires/segments.tmp"
    mv -- "$src_hires/segments.tmp" "$src_hires/segments"
  fi

  mkdir -p "$seg_hires"

  # Half of the window. Computed in its own assignment so that a perl failure
  # aborts the script (set -e) instead of silently passing an empty value, and
  # with $window passed as an argument rather than spliced into the perl code.
  max_remaining=$(perl -e 'print $ARGV[0] / 2.0' "$window")

  utils/data/get_uniform_subsegments.py \
    --max-segment-duration="$window" \
    --overlap-duration="$overlap" \
    --max-remaining-duration="$max_remaining" \
    "$src_hires/segments" > "$seg_hires/sub_segments"

  utils/data/subsegment_data_dir.sh "$src_hires" \
    "$seg_hires/sub_segments" "$seg_hires"
  steps/compute_cmvn_stats.sh "$seg_hires"

  utils/fix_data_dir.sh "$seg_hires"
  utils/validate_data_dir.sh --no-text "$seg_hires"
fi