Blame view

egs/zeroth_korean/s5/local/update_segmentation.sh 975 Bytes
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
  #!/bin/bash
  
  # Copyright 2017 Lucas Jo (Atlas Guide)
  # Apache 2.0
  
  # Run this script whenever the segmentation rule has changed.
  # Positional arguments:
  #   $1 - data directory; its "text" file is the transcript to re-segment
  #   $2 - directory containing the Morfessor model zeroth_morfessor.seg
  dataDir="${1}"
  lmDir="${2}"
  
  # Succeeds (exit status 0) when the command named by $1 can be resolved by
  # the shell, and fails otherwise. Writes nothing to stdout or stderr.
  exists() {
    local cmd_name=$1
    command -v "$cmd_name" &> /dev/null
  }
  
  # Check the Morfessor installation: the re-segmentation step of this script
  # pipes the transcripts through the `morfessor` command, so stop early with
  # exit status 1 when it cannot be found. Diagnostics go to stderr.
  if ! exists morfessor; then
    echo "You appear to not have Morfessor installed on your path." >&2
    echo "See tools/extras/install_morfessor.sh for installation instructions." >&2
    exit 1
  fi
  
  # Re-segment the transcript file $dataDir/text with Morfessor.
  #
  # The first whitespace-separated field of every line (the index; in a Kaldi
  # text file this is the utterance id) is set aside, the rest of the line is
  # whitespace-normalized and passed through morfessor using the model
  # $lmDir/zeroth_morfessor.seg, and the index is pasted back in front of the
  # segmented text. The previous transcript is kept as $dataDir/text.old.
  trans="$dataDir/text"
  echo "Re-segment transcripts: $trans --------------------------------------------"
  if [ ! -f "$trans" ]; then
    echo "transcription file is not found in $dataDir" >&2
    exit 1
  fi

  # Scratch files get unpredictable names next to the transcript and are
  # removed on every exit path, including failures.
  tmp_index=$(mktemp "${trans}_tmp_index.XXXXXX") || exit 1
  tmp_out=$(mktemp "${trans}_tmp_out.XXXXXX") || { rm -f -- "$tmp_index"; exit 1; }
  trap 'rm -f -- "$tmp_index" "$tmp_out"' EXIT

  cp -- "$trans" "$trans.old" || exit 1
  awk '{print $1}' "$trans.old" > "$tmp_index" || exit 1

  # Without pipefail only the exit status of the final `paste` is visible, so
  # a failing morfessor run would go unnoticed.
  set -o pipefail
  if ! cut -d' ' -f2- "$trans.old" |
      sed -E 's/\s+/ /g; s/^\s//g; s/\s$//g' |
      morfessor -e 'utf-8' -l "$lmDir/zeroth_morfessor.seg" -T - -o - \
        --output-format '{analysis} ' --output-newlines \
        --nosplit-re '[0-9\[\]\(\){}a-zA-Z&.,\-]+' |
      paste -d" " "$tmp_index" - > "$tmp_out"; then
    echo "re-segmentation failed; $trans is left unchanged" >&2
    exit 1
  fi

  # Overwrite the transcript only after the whole pipeline succeeded. Copying
  # into the existing file (instead of moving the scratch file over it) keeps
  # the transcript's original permissions.
  cp -- "$tmp_out" "$trans" || exit 1