farsdat_norm_trans.sh 3.54 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89


#!/bin/bash

# Copyright 2014   University of Tehran (Author: Bagher BabaAli)
# Apache 2.0.

# This script normalizes phonetic transcripts that have been extracted in a
# format where each line contains an utterance ID followed by the transcript,
# e.g.:
#   <utterance-id> <phone_1> <phone_2> ... <phone_N>
# It maps the phone symbols found in the transcript to a smaller set. The
# mapping is hard-coded in the awk/tr/sed pipeline below, and the only
# argument is the transcription file.
# NOTE(review): earlier wording of this header referred to TIMIT, to -m and
# -from options, to 48/39/60-phone sets and to deleting a glottal stop; none
# of that is implemented by this script (presumably text inherited from the
# TIMIT recipe) - confirm and drop this note.


# Require exactly one argument: the transcription file to normalize.
# The usage message is sent to stderr because stdout is this script's data
# channel (the normalized transcript is written there), so an error printed
# to stdout could end up inside a redirected output file.
if [ "$#" -ne 1 ]; then
  echo "Argument should be a transcription file in a format where each line contains an utterance ID followed by the transcript." >&2
  exit 1
fi

# Normalize one transcription file and write the result to stdout.
#
# Arguments: $1 - transcription file; each line is an utterance ID followed
#                 by whitespace-separated phone symbols. The path is quoted,
#                 so names with spaces work; "-" is still handed to cat and
#                 therefore still means stdin.
# Outputs:   one line per input line, every emitted token followed by a
#            single space.
#
# Stage 1 (awk): each marker symbol listed in the table is replaced by its
#   phone, or dropped when the following token is already that phone.
# Stage 2 (tr):  every "c" becomes "k" and every ";" becomes "g".
# Stage 3 (sed): every "j" becomes "sil" (plain BRE, no GNU-only -r needed).
# Stages 2 and 3 operate on all characters of the line, so they also apply
# to the utterance ID in the first column.
normalize_transcript() {
  cat -- "$1" \
    | awk '
      BEGIN {
        # marker symbol -> phone it stands for
        phone_for["\\"] = "p"
        phone_for["`"]  = "b"
        phone_for["-"]  = "t"
        phone_for["="]  = "d"
        phone_for["@"]  = "c"
        phone_for["*"]  = "k"
        phone_for["!"]  = ";"
        phone_for["&"]  = "g"
        phone_for["^"]  = "q"
        phone_for["#"]  = ","
        phone_for["$"]  = "'\''"
        phone_for["("]  = "]"
      }
      {
        for (i = 1; i <= NF; ++i) {
          # Ordinary tokens are copied through unchanged.
          if (!($i in phone_for)) {
            printf("%s ", $i)
            continue
          }
          # NOTE(review): "i + 1 == NF" forces the phone out whenever the
          # marker is the second-to-last token, even if the last token is
          # that same phone. Kept as-is to preserve existing output; it may
          # have been meant as "i == NF" - confirm before changing.
          if ((i + 1 == NF) || ($(i + 1) != phone_for[$i])) {
            printf("%s ", phone_for[$i])
          }
        }
        printf("\n")
      }' \
    | tr 'c;' 'kg' \
    | sed 's/j/sil/g'
}

normalize_transcript "$1"