Blame view
egs/sprakbanken/s5/local/run_sgmm2.sh
7.12 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 |
#!/bin/bash # This script is invoked from ../run.sh # It contains some SGMM-related scripts that I am breaking out of the main run.sh for clarity. . ./cmd.sh # Note: you might want to try to give the option --spk-dep-weights=false to train_sgmm2.sh; # this takes out the "symmetric SGMM" part which is not always helpful. # SGMM system on si84 data [sgmm5a]. Note: the system we aligned from used the si284 data for # training, but this shouldn't have much effect. ( steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ data/train_si84 data/lang exp/tri4b exp/tri4b_ali_si84 || exit 1; steps/train_ubm.sh --cmd "$train_cmd" \ 400 data/train_si84 data/lang exp/tri4b_ali_si84 exp/ubm5a || exit 1; steps/train_sgmm2.sh --cmd "$train_cmd" \ 7000 9000 data/train_si84 data/lang exp/tri4b_ali_si84 \ exp/ubm5a/final.ubm exp/sgmm2_5a || exit 1; ( utils/mkgraph.sh data/lang_test_tgpr exp/sgmm2_5a exp/sgmm2_5a/graph_tgpr steps/decode_sgmm2.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \ exp/sgmm2_5a/graph_tgpr data/test_dev93 exp/sgmm2_5a/decode_tgpr_dev93 ) & steps/align_sgmm2.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali_si84 \ --use-graphs true --use-gselect true data/train_si84 data/lang exp/sgmm2_5a exp/sgmm2_5a_ali_si84 || exit 1; steps/make_denlats_sgmm2.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 \ data/train_si84 data/lang exp/sgmm2_5a_ali_si84 exp/sgmm2_5a_denlats_si84 steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 --boost 0.1 \ data/train_si84 data/lang exp/sgmm2_5a_ali_si84 exp/sgmm2_5a_denlats_si84 exp/sgmm2_5a_mmi_b0.1 for iter in 1 2 3 4; do steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \ --transform-dir exp/tri4b/decode_tgpr_dev93 data/lang_test_tgpr data/test_dev93 exp/sgmm2_5a/decode_tgpr_dev93 \ exp/sgmm2_5a_mmi_b0.1/decode_tgpr_dev93_it$iter & done steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 --boost 0.1 \ --update-opts "--cov-min-value=0.9" data/train_si84 data/lang exp/sgmm2_5a_ali_si84 exp/sgmm2_5a_denlats_si84 exp/sgmm2_5a_mmi_b0.1_m0.9 for iter in 1 2 3 4; do steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \ --transform-dir exp/tri4b/decode_tgpr_dev93 data/lang_test_tgpr data/test_dev93 exp/sgmm2_5a/decode_tgpr_dev93 \ exp/sgmm2_5a_mmi_b0.1_m0.9/decode_tgpr_dev93_it$iter & done ) & ( # The next commands are the same thing on all the si284 data. # SGMM system on the si284 data [sgmm5b] steps/train_ubm.sh --cmd "$train_cmd" \ 600 data/train_si284 data/lang exp/tri4b_ali_si284 exp/ubm5b || exit 1; steps/train_sgmm2.sh --cmd "$train_cmd" \ 11000 25000 data/train_si284 data/lang exp/tri4b_ali_si284 \ exp/ubm5b/final.ubm exp/sgmm2_5b || exit 1; ( utils/mkgraph.sh data/lang_test_tgpr exp/sgmm2_5b exp/sgmm2_5b/graph_tgpr steps/decode_sgmm2.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \ exp/sgmm2_5b/graph_tgpr data/test_dev93 exp/sgmm2_5b/decode_tgpr_dev93 steps/decode_sgmm2.sh --nj 8 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_eval92 \ exp/sgmm2_5b/graph_tgpr data/test_eval92 exp/sgmm2_5b/decode_tgpr_eval92 utils/mkgraph.sh data/lang_test_bd_tgpr exp/sgmm2_5b exp/sgmm2_5b/graph_bd_tgpr || exit 1; steps/decode_sgmm2.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_bd_tgpr_dev93 \ exp/sgmm2_5b/graph_bd_tgpr data/test_dev93 exp/sgmm2_5b/decode_bd_tgpr_dev93 steps/decode_sgmm2.sh --nj 8 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_bd_tgpr_eval92 \ exp/sgmm2_5b/graph_bd_tgpr data/test_eval92 exp/sgmm2_5b/decode_bd_tgpr_eval92 ) & # This shows how you would build and test a quinphone SGMM2 system, but ( steps/train_sgmm2.sh --cmd "$train_cmd" \ --context-opts "--context-width=5 --central-position=2" \ 11000 25000 data/train_si284 data/lang exp/tri4b_ali_si284 \ exp/ubm5b/final.ubm exp/sgmm2_5c || exit 1; # Decode from lattices in exp/sgmm2_5b steps/decode_sgmm2_fromlats.sh --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \ data/test_dev93 data/lang_test_tgpr exp/sgmm2_5b/decode_tgpr_dev93 exp/sgmm2_5c/decode_tgpr_dev93 steps/decode_sgmm2_fromlats.sh --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_eval92 \ data/test_eval92 data/lang_test_tgpr exp/sgmm2_5b/decode_tgpr_eval92 exp/sgmm2_5c/decode_tgpr_eval92 ) & steps/align_sgmm2.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali_si284 \ --use-graphs true --use-gselect true data/train_si284 data/lang exp/sgmm2_5b exp/sgmm2_5b_ali_si284 steps/make_denlats_sgmm2.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 \ data/train_si284 data/lang exp/sgmm2_5b_ali_si284 exp/sgmm2_5b_denlats_si284 steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 --boost 0.1 \ data/train_si284 data/lang exp/sgmm2_5b_ali_si284 exp/sgmm2_5b_denlats_si284 exp/sgmm2_5b_mmi_b0.1 for iter in 1 2 3 4; do for test in eval92; do # dev93 steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \ --transform-dir exp/tri4b/decode_bd_tgpr_${test} data/lang_test_bd_fg data/test_${test} exp/sgmm2_5b/decode_bd_tgpr_${test} \ exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_${test}_it$iter & done done steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 --boost 0.1 \ --drop-frames true data/train_si284 data/lang exp/sgmm2_5b_ali_si284 exp/sgmm2_5b_denlats_si284 exp/sgmm2_5b_mmi_b0.1_z for iter in 1 2 3 4; do for test in eval92 dev93; do steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \ --transform-dir exp/tri4b/decode_bd_tgpr_${test} data/lang_test_bd_fg data/test_${test} exp/sgmm2_5b/decode_bd_tgpr_${test} \ exp/sgmm2_5b_mmi_b0.1_z/decode_bd_tgpr_${test}_it$iter & done done ) & wait # Examples of combining some of the best decodings: SGMM+MMI with # MMI+fMMI on a conventional system. local/score_combine.sh data/test_eval92 \ data/lang_test_bd_tgpr \ exp/tri4b_fmmi_a/decode_tgpr_eval92_it8 \ exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3 \ exp/combine_tri4b_fmmi_a_sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it8_3 # %WER 4.43 [ 250 / 5643, 41 ins, 12 del, 197 sub ] exp/tri4b_fmmi_a/decode_tgpr_eval92_it8/wer_11 # %WER 3.85 [ 217 / 5643, 35 ins, 11 del, 171 sub ] exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3/wer_10 # combined to: # %WER 3.76 [ 212 / 5643, 32 ins, 12 del, 168 sub ] exp/combine_tri4b_fmmi_a_sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it8_3/wer_12 # Checking MBR decode of baseline: rm -r exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3.mbr 2>/dev/null cp -r exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3{,.mbr} local/score_mbr.sh data/test_eval92 data/lang_test_bd_tgpr exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3.mbr # MBR decoding did not seem to help (baseline was 3.85). I think this is normal at such low WERs. %WER 3.86 [ 218 / 5643, 35 ins, 11 del, 172 sub ] exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_eval92_it3.mbr/wer_10 |