Commit 7c52739538a4527d4e71336e7d451c9f58c11eab

Authored by Jean-François Rey
1 parent 1fa99e8a2b
Exists in master

up

Showing 1 changed file with 2 additions and 0 deletions Inline Diff

main_tools/ExploitConfidencePass.sh
#!/bin/bash

#####################################################
# File    : ExploitConfidencePass.sh                #
# Brief   : Exploit the ASR confidence pass to :    #
#           -> boost the confident zone             #
#           -> find alternative in non confident    #
#              zone                                 #
#           -> dynamically extend the lexicon       #
# Author  : Jean-François Rey                       #
#           (based on Emmanuel Ferreira             #
#           and Hugo Mauchrétien works)             #
# Version : 1.0                                     #
# Date    : 25/06/13                                #
#####################################################

echo "### ExploitConfidencePass.sh ###"
17 17
# Check OTMEDIA_HOME env var; when unset, fall back to the
# grandparent directory of this script (resolved via readlink -e).
if [ -z "${OTMEDIA_HOME}" ]
then
    OTMEDIA_HOME=$(dirname "$(dirname "$(readlink -e "$0")")")
    export OTMEDIA_HOME
fi
24 24
# Directory containing ExploitConfidencePass.sh itself
MAIN_SCRIPT_PATH=$(dirname "$(readlink -e "$0")")

# Default location of the helper scripts unless already provided
if [ -z "${SCRIPT_PATH}" ]
then
    SCRIPT_PATH="$OTMEDIA_HOME/tools/scripts"
fi
32 32
# Include helper scripts (print_* logging helpers, pass checks)
. "$SCRIPT_PATH/Tools.sh"
. "$SCRIPT_PATH/CheckExploitConfPass.sh"

# Load ExploitConfidencePass.cfg or abort: the rest of the pass
# depends on the variables it defines.
EXPLOITCONFIDENCEPASS_CONFIG_FILE="$OTMEDIA_HOME/cfg/ExploitConfidencePass.cfg"
if [ -e "$EXPLOITCONFIDENCEPASS_CONFIG_FILE" ]
then
    . "$EXPLOITCONFIDENCEPASS_CONFIG_FILE"
else
    echo "ERROR : Can't find configuration file $EXPLOITCONFIDENCEPASS_CONFIG_FILE" >&2
    exit 1
fi
46 46
#---------------#
# Parse Options #
#---------------#
# Defaults for the option flags: they are tested with -eq later,
# so they must hold a numeric value even when no option is given.
# (Environment-provided values are preserved.)
DEBUG=${DEBUG:-0}
VERBOSE=${VERBOSE:-0}
CHECK=${CHECK:-0}
RERUN=${RERUN:-0}
while getopts ":hDv:cr" opt
do
    case $opt in
        h)
            echo -e "$0 [OPTIONS] <INPUT_DIRECTORY>\n"
            echo -e "\t Options:"
            echo -e "\t\t-h :\tprint this message"
            echo -e "\t\t-D :\tDEBUG mode on"
            echo -e "\t\t-v l :\tVerbose mode, l=(1|2|3) level mode"
            echo -e "\t\t-c :\tCheck process, stop if error detected"
            echo -e "\t\t-r n :\tforce rerun without deleting files"
            exit 1
            ;;
        D)
            DEBUG=1
            ;;
        v)
            VERBOSE=$OPTARG
            ;;
        c)
            CHECK=1
            ;;
        r)
            RERUN=1
            ;;
        :)
            echo "Option -$OPTARG requires an argument." >&2
            exit 1
            ;;
        \?)
            echo "BAD USAGE : unknown option -$OPTARG"
            #exit 1
            ;;
    esac
done
85 85
# mode debug enable (trace every command with set -x)
if [ "${DEBUG:-0}" -eq 1 ]
then
    set -x
    echo -e "## Mode DEBUG ON ##"
fi

# mode verbose enable
if [ "${VERBOSE:-0}" -gt 0 ]; then echo -e "## Verbose level : $VERBOSE ##" ;fi

# Check USAGE by arguments number: exactly one positional argument
# (the input directory) must remain after the parsed options.
if [ $(($#-($OPTIND-1))) -ne 1 ]
then
    echo "BAD USAGE : ExploitConfidencePass.sh [OPTIONS] <INPUT_DIRECTORY>"
    echo "$0 -h for more info"
    exit 1
fi
103 103
shift $((OPTIND-1))
# check input directory - first argument
# Quoting "$1" also makes an empty/missing argument fail here
# instead of slipping through the test.
if [ ! -e "$1" ]
then
    print_error "can't open $1"
    exit 1
fi
111 111
# NOTE(review): BASENAME is referenced here before being assigned
# below, so this first banner is tagged with an empty name — confirm
# whether a caller exports BASENAME beforehand.
print_info "[${BASENAME}] => ExploitConfPass start | $(date +'%d/%m/%y %H:%M:%S')" 1

#-------------#
# GLOBAL VARS #
#-------------#
INPUT_DIR=$(readlink -e "$1")
OUTPUT_DIR=$INPUT_DIR
BASENAME=$(basename "$OUTPUT_DIR")
SHOW_DIR="$OUTPUT_DIR/shows/"                       # per-show ctm/seg files
SOLR_RES="$OUTPUT_DIR/solr/"                        # SOLR answers (keywords/txt)
EXT_LEX="$OUTPUT_DIR/LEX/"                          # extended lexicon workspace
TRIGGER_CONFZONE="$OUTPUT_DIR/trigg/"               # trigger/boost files
LOGFILE="$OUTPUT_DIR/info_exploitconf.log"
ERRORFILE="$OUTPUT_DIR/error_exploitconf.log"
126 126
# Read back the configuration written by the previous confidence pass;
# when it is missing, fall back to the res_p2 result directories.
CONFPASS_CONFIG_FILE="$(readlink -e "$1")/ConfPass.cfg"
if [ -e "$CONFPASS_CONFIG_FILE" ]
then
    RES_CONF_DIR=$(grep "^RES_CONF_DIR=" "$CONFPASS_CONFIG_FILE" | cut -f2 -d"=")
    RES_CONF=$(grep "^CONF_DIR=" "$CONFPASS_CONFIG_FILE" | cut -f2 -d"=")
    print_info "[${BASENAME}] Use confidence measure from : $RES_CONF" 2
else
    print_error "[${BASENAME}] Can't find $CONFPASS_CONFIG_FILE"
    print_error "[${BASENAME}] -> use res_p2"
    RES_CONF_DIR="$INPUT_DIR/conf/res_p2/scored_ctm"
    RES_CONF="$INPUT_DIR/conf/res_p2"
fi
143 143
# Create the working sub-directories (errors such as "already exists"
# are deliberately ignored).
mkdir -p "$SHOW_DIR" > /dev/null 2>&1
mkdir -p "$SOLR_RES" > /dev/null 2>&1
mkdir -p "$EXT_LEX" > /dev/null 2>&1
mkdir -p "$TRIGGER_CONFZONE" > /dev/null 2>&1
148 148
#------------------#
# Create Workspace #
#------------------#
# Lock directory: abort when a previous run is still locked, unless a
# rerun was forced with -r.
# FIX(review): the original tested "$OUTPUT_DIR_BASENAME/…", a variable
# never set in this script, while the lock file is created in
# $OUTPUT_DIR below — so the lock was never detected. Test $OUTPUT_DIR.
if [ -e "$OUTPUT_DIR/EXPLOITCONFPASS.lock" ] && [ "${RERUN:-0}" -eq 0 ]
then
    print_warn "[${BASENAME}] ExploitConfidencePass is locked -> exit" 2
    exit 1
fi
rm "$OUTPUT_DIR/EXPLOITCONFPASS.unlock" > /dev/null 2>&1
touch "$OUTPUT_DIR/EXPLOITCONFPASS.lock" > /dev/null 2>&1

# Start this run with fresh log files
rm "$LOGFILE" "$ERRORFILE" 2>/dev/null
162
#------#
# Save #
#------#
# Snapshot the configuration used by this run, augmented with the
# paths computed above (read back by the next decoding pass).
# NOTE(review): ${lexname} is only assigned later in the script, so
# these two LEX_* entries are written with an empty basename — confirm
# intended ordering.
cp "$EXPLOITCONFIDENCEPASS_CONFIG_FILE" "$OUTPUT_DIR/ExploitConfPass.cfg"
echo "TRIGGER_DIR=$TRIGGER_CONFZONE" >> "$OUTPUT_DIR/ExploitConfPass.cfg"
echo "TRIGGER_SPEERAL=$TRIGGER_CONFZONE/speeral/" >> "$OUTPUT_DIR/ExploitConfPass.cfg"
echo "LEX_SPEERAL=$EXT_LEX/speeral/${lexname}_ext" >> "$OUTPUT_DIR/ExploitConfPass.cfg"
echo "LEX_BINODE_SPEERAL=$EXT_LEX/speeral/${lexname}_ext.bin" >> "$OUTPUT_DIR/ExploitConfPass.cfg"
# FIX(review): message printed the unset $OUTPUT_DIR_BASENAME; report
# the real destination directory instead.
print_info "[${BASENAME}] Save config in $OUTPUT_DIR/ExploitConfPass.cfg" 1
170 172
#---------------#
# Check Pass    #
#---------------#
# Abort when the confidence pass produced no .res files at all.
if [ "$(ls ${RES_CONF_DIR}/*.res 2> /dev/null | wc -l)" -eq 0 ]
then
    print_error "[${BASENAME}] No Conf Pass res -> exit ExploitConfPass"
    if [ "${CHECK:-0}" -eq 1 ]; then print_log_file "$ERRORFILE" "No ConfPass res in ${RES_CONF_DIR}" ;fi
    exit 1
fi
180 182
#-----------------------#
# Segmentation by show  #
#-----------------------#
# create txt file from scored res
# tag pos and lemmatization of the txt file
# merge the scored res and taglem file
# segment using the last generated file
# and create a ctm file by show

print_info "[${BASENAME}] Segmentation by show" 1

# -> to txt
print_info "[${BASENAME}] Create txt from scored res" 3
cat ${RES_CONF_DIR}/*.res > "$INPUT_DIR/$BASENAME.sctm"
cat "$INPUT_DIR/$BASENAME.seg" | $SIGMUND_BIN/myConvert.pl "$INPUT_DIR/$BASENAME.sctm" "$INPUT_DIR/$BASENAME.tmp"
cat "$INPUT_DIR/$BASENAME.tmp" | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -f | sed -e "s/_/ /g" | sort -nt 'n' -k '2' > "$INPUT_DIR/$BASENAME.txt"

# -> to tagger + lemme
print_info "[${BASENAME}] Tag pos and lem in txt file" 3
iconv -t ISO_8859-1 "$INPUT_DIR/$BASENAME.txt" > "$INPUT_DIR/$BASENAME.tmp"
$SIGMUND_BIN/txt2lem.sh "$INPUT_DIR/$BASENAME.tmp" "$INPUT_DIR/$BASENAME.taglem"

# merge sctm and taglem
print_info "[${BASENAME}] Merge scored ctm with tag pos and lem file" 3
cat "$INPUT_DIR/$BASENAME.sctm" | $SCRIPT_PATH/BdlexUC.pl ${RULES}/basic -f | iconv -t ISO_8859-1 | $SCRIPT_PATH/scoredCtmAndTaggedLem2All.pl "$INPUT_DIR/$BASENAME.taglem" > "$INPUT_DIR/$BASENAME.ctl"

# -> new seg
print_info "[${BASENAME}] Create xml file and run Topic Seg" 3
$SIGMUND_BIN/tagLem2xml.pl "$INPUT_DIR/$BASENAME.taglem" "$INPUT_DIR/$BASENAME.doc.xml"
rm "$INPUT_DIR/$BASENAME.tmp" #$INPUT_DIR/$BASENAME.taglem

# Lia_topic_seg : bring together sentences into show
# NOTE(review): the Test class reads its input as ./0.xml in the
# current working directory — confirm cwd is writable.
cp "$INPUT_DIR/$BASENAME.doc.xml" 0.xml
java -cp $LIATOPICSEG/bin Test > "$INPUT_DIR/show.seg"
cat "$INPUT_DIR/show.seg" | $SIGMUND_BIN/toSegEmiss.pl "$INPUT_DIR/$BASENAME.show.seg"
rm 0.xml "$INPUT_DIR/show.seg"
217 219
# In check mode, warn when the topic segmentation produced no output.
if [ "${CHECK:-0}" -eq 1 ]
then
    if [ ! -s "$INPUT_DIR/$BASENAME.show.seg" ]
    then
        print_error "[${BASENAME}] No Topic segmentation ! "
        print_error "[${BASENAME}] Check $ERRORFILE "
        print_log_file "$ERRORFILE" "No Topic segmentation in ${BASENAME}.show.seg"
    fi
fi

# Segment ctm into several show files and create a seg list by show
print_info "[${BASENAME}] Segment ctm into show files and a seg list by show" 1
$SCRIPT_PATH/ctm2show.pl "$INPUT_DIR/$BASENAME.ctl" "$INPUT_DIR/$BASENAME.show.seg" "$SHOW_DIR"
231 233
#-----------------------------------------------------------#
# SOLR QUERIES                                              #
# -> Create Confidente Word                                 #
#    Keep conf words and use Tags                           #
# -> Query SOLR (document & multimedia)                     #
#    concat word + add date 2 day before and after the show #
#    query document & multimedia                            #
#-----------------------------------------------------------#
print_info "[${BASENAME}] Create SOLR queries and ask SOLR" 1
for show in $(ls $SHOW_DIR/*.ctm)
do
    bn=$(basename "$show" .ctm)
    # Remove words with low confidence and keep useful tagger words
    cat "$show" | $SCRIPT_PATH/KeepConfZone.pl | grep -e "MOTINC\|NMS\|NMP\|NFS\|NFP\|X[A-Z]{3,5}" | cut -f3 -d' ' > "$SHOW_DIR/$bn.confzone"
    # Get date 2 day before and after the show
    # (assumes the first 6 chars of BASENAME encode the date — TODO confirm)
    datePattern=$($SCRIPT_PATH/daybefore2after.sh "$(echo $BASENAME | cut -c1-6)")
    # Create SOLR queries
    cat "$SHOW_DIR/$bn.confzone" | $SCRIPT_PATH/GenerateSOLRQueries.pl | iconv -f ISO_8859-1 -t UTF-8 > "$SHOW_DIR/$bn.queries"
    # Ask SOLR DB only when the query file is non-empty
    if [ "$(wc -w "$SHOW_DIR/$bn.queries" | cut -f1 -d' ')" -gt 0 ]; then
        query=$(cat "$SHOW_DIR/$bn.queries")"&fq=docDate:[$datePattern]"
        echo $query > "$SHOW_DIR/$bn.queries"
        print_info "python $SCRIPT_PATH/ProcessSOLRQueries.py $SHOW_DIR/$bn.queries $SOLR_RES/$bn.keywords.tmp $SOLR_RES/$bn.txt.tmp" 3
        python $SCRIPT_PATH/ProcessSOLRQueries.py "$SHOW_DIR/$bn.queries" "$SOLR_RES/$bn.keywords.tmp" "$SOLR_RES/$bn.txt.tmp"
        cat "$SOLR_RES/$bn.keywords.tmp" | sort -u > "$SOLR_RES/$bn.keywords"
        cat "$SOLR_RES/$bn.txt.tmp" | sort -u > "$SOLR_RES/$bn.txt"
        rm $SOLR_RES/*.tmp > /dev/null 2>&1
    fi

    if [ "${CHECK:-0}" -eq 1 ]
    then
        if [ ! -e "$SOLR_RES/$bn.keywords" ] || [ ! -e "$SOLR_RES/$bn.txt" ]
        then
            print_warn "$bn.keywords and $bn.txt are empty !\nMaybe SOLR server is down !" 2
            print_log_file "$LOGFILE" "$bn.keywords and $bn.txt are empty !\nMaybe SOLR server is down !"
        fi
    fi

done
271 273
#-----------------------------------------------------------------------------------------------
# Build trigger file
#   1) keywords are automatically boosted in the non confident zone of the current res
#      confident zone are boosted
#      previous words in sensible zone are penalized
#   2) OOVs are extracted + phonetized
#   3) Try to find OOVs acousticly in the current segment
#   4) Generate the .trigg file
#------------------------------------------------------------------------------------------------
print_info "[${BASENAME}] Build trigger files" 1
for i in $(ls $SOLR_RES/*.keywords)
do
    basename=$(basename "$i" .keywords)

    #
    # Tokenize & produce coverage report
    # Use filter you need
    #
    print_info "[${BASENAME}] keywords filtering and produce coverage report" 3
    # Default filter
    cat "$i" | $SCRIPT_PATH/CleanFilter.sh | ${SCRIPT_PATH}/ApplyCorrectionRules.pl ${LEXICON}.regex | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t |\
        $SCRIPT_PATH/CoverageReportMaker.pl --out "$SOLR_RES/${basename}_tmp_report" $LEXICON.bdlex_tok
    # do less filter
    #cat $i | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t | sed -f $RULES/preprocess.regex | sed -f $RULES/lastprocess.regex | $SCRIPT_PATH/CoverageReportMaker.pl --out $SOLR_RES/${basename}_tmp_report $LEXICON.bdlex_tok


    #
    # Extract "real" OOV and phonetize them
    # -> small ad-hoc filtering to avoid too much noise
    #
    print_info "[${BASENAME}] Extract OOV and phonetize them" 3
    ${SCRIPT_PATH}/FindNormRules.pl "$SOLR_RES/${basename}_tmp_report/report.oov" $LEXICON.bdlex_tok | cut -f3 | grep -v "#" | grep -v "^[A-Z]\+$" | grep -v "^[0-9]" | grep --perl-regex -v "^([a-z']){1,3}$" | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -f | iconv -t ISO_8859-1 -f UTF-8 | ${LIA_LTBOX}/lia_phon/script/lia_lex2phon_variante | grep -v "core dumped" | cut -d"[" -f1 | sort -u | ${SCRIPT_PATH}/PhonFormatter.pl | iconv -f ISO_8859-1 -t UTF-8 | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t > "$SOLR_RES/${basename}.phon_oov"

    #
    # Search INVOC & OOV in the current lattice
    #
    print_info "[${BASENAME}] Search INVOC and OOV in the current lattice" 3
    cat "$SOLR_RES/${basename}_tmp_report/report.invoc" | grep -v "\b0" | cut -f1 | grep -v --perl-regex -v "^[a-zA-Z']{1,3}$" | grep -v --perl-regex "^[a-zA-Z0-9]{1,3}$" | grep -v "<s>" | grep -v "</s>" | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t > "$TRIGGER_CONFZONE/$basename.tosearch"
    cat "$SOLR_RES/${basename}.phon_oov" | cut -f1 >> "$TRIGGER_CONFZONE/$basename.tosearch"

    # For each treil (lattice) of the show
    for baseseg in $(cat "$SHOW_DIR/$basename.lst")
    do
        $OTMEDIA_HOME/tools/QUOTE_FINDER/bin/acousticFinder ${LEXICON}.speer_phon "$RES_CONF/wlat/$baseseg.wlat" "$TRIGGER_CONFZONE/${basename}.tosearch" "$SOLR_RES/$basename.phon_oov" > "$TRIGGER_CONFZONE/$baseseg.acousticlyfound" $OUTPUT_REDIRECTION
        #
        # Produce the boost file for the next decoding pass
        #
        print_info "[${BASENAME}] Produce trigg file : $baseseg " 3
        cat "$RES_CONF_DIR/$baseseg.res" | $SCRIPT_PATH/ScoreCtm2trigg.pl "$TRIGGER_CONFZONE/$baseseg.acousticlyfound" > "$TRIGGER_CONFZONE/$baseseg.trigg"
    done

done
324 326
#-----------------------------------------------------------------------------------------------
# Build the extended SPEERAL Lexicon
#   1) Merge OOVs + LEXICON
#   1) Related text are collected in order to find the invoc word with maximizing the ppl (LM proba)
#   2) The current lexicon is extended with all the valid OOVs
#-----------------------------------------------------------------------------------------------
print_info "[${BASENAME}] Build extended Speeral Lexicon" 1
mkdir -p "$EXT_LEX/final"
mkdir -p "$EXT_LEX/tmp"
mkdir -p "$EXT_LEX/tmp/txt"
#
# Collect the acousticly found oov and their phonetisation
#
print_info "[${BASENAME}] Get all OOV and retrieve all phonetisation" 3
for i in $(ls $SOLR_RES/*.phon_oov)
do
    basename=$(basename "$i" .phon_oov)

    rm "$EXT_LEX/$basename.acousticlyfound" 2> /dev/null
    # list acousticly found for the show
    for baseseg in $(cat "$SHOW_DIR/$basename.lst")
    do
        cat "$TRIGGER_CONFZONE/$baseseg.acousticlyfound" | cut -f1 | cut -f2 -d"=" >> "$EXT_LEX/$basename.acousticlyfound"
    done
    # de-duplicate the per-show list
    cat "$EXT_LEX/$basename.acousticlyfound" | sort -u > "$EXT_LEX/.tmp"
    mv "$EXT_LEX/.tmp" "$EXT_LEX/$basename.acousticlyfound"

    #
    # Extract OOV really added
    #
    cat "$SOLR_RES/$basename.phon_oov" | cut -f1 | sort -u > "$EXT_LEX/$basename.oov"
    $SCRIPT_PATH/intersec.pl "$EXT_LEX/$basename.oov" "$EXT_LEX/$basename.acousticlyfound" > "$EXT_LEX/$basename.oov_acousticlyfound"
    #
    # Retrieve all phonetisation
    #
    cat "$SOLR_RES/${basename}.phon_oov" | $SCRIPT_PATH/LexPhonFilter.pl "$EXT_LEX/$basename.oov_acousticlyfound" > "$EXT_LEX/$basename.oov_acousticlyfound_phon"
done
362 364
#
# Merge OOVs and their phonetisation
#
print_info "[${BASENAME}] Merge OOV and their phonetisation" 3
lexname=$(basename "$LEXICON")
cat $EXT_LEX/*.oov_acousticlyfound_phon | sort -u > "$EXT_LEX/final/all.oov_acousticlyfound_phon"
cat $EXT_LEX/*.oov_acousticlyfound | sort -u | grep --perl-regex -v "^([a-z']){3}$" > "$EXT_LEX/final/all.oov_acousticlyfound"
$SCRIPT_PATH/MergeLexicon.pl "$EXT_LEX/final/all.oov_acousticlyfound_phon" > "$EXT_LEX/final/${lexname}_ext.phon"
371 373
#
# Collect + clean retrieved txt
#
print_info "[${BASENAME}] Collect and clean SOLR txt answers" 2
# choose filter
# default
cat $SOLR_RES/*.txt | $SCRIPT_PATH/CleanFilter.sh | $SCRIPT_PATH/ApplyCorrectionRules.pl ${LEXICON}.regex | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t > "$EXT_LEX/final/all.bdlex_txt"
# low filter
#cat $SOLR_RES/*.txt | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t | sed -f $RULES/preprocess.regex | sed -f $RULES/lastprocess.regex > $EXT_LEX/final/all.bdlex_txt
381 383
#
# Construct the map file
#
# Notes:
#   - Expected format :
#     <WORD1_STRING> <CANDIDATE1_STRING> <PHON_1>
#
print_info "[${BASENAME}] Construct map file" 3
# start from clean output files for this run
rm -f "$EXT_LEX/final/${lexname}_ext.map" 2>/dev/null
rm -f "$EXT_LEX/final/${lexname}.unvalid_oov" 2>/dev/null
392 394
393 while read oov 395 while read oov
394 do 396 do
395 oov=`echo $oov | sed "s/\n//g"` 397 oov=`echo $oov | sed "s/\n//g"`
396 # 398 #
397 # Obtain the oov's tag 399 # Obtain the oov's tag
398 # 400 #
399 #oov_tag=`grep --perl-regex "^$oov\t" $DYNAMIC_TAGSTATS/all.tags | cut -f2` 401 #oov_tag=`grep --perl-regex "^$oov\t" $DYNAMIC_TAGSTATS/all.tags | cut -f2`
400 # 402 #
401 # Try to collect text containing the oov word 403 # Try to collect text containing the oov word
402 # 404 #
403 print_info "[${BASENAME}] Collect text containing the oov" 3 405 print_info "[${BASENAME}] Collect text containing the oov" 3
404 cat $EXT_LEX/final/all.bdlex_txt | grep --perl-regex " $oov " | $SCRIPT_PATH/NbMaxWordsFilter.pl 40 |uniq > $EXT_LEX/tmp/txt/$oov.bdlex_txt 406 cat $EXT_LEX/final/all.bdlex_txt | grep --perl-regex " $oov " | $SCRIPT_PATH/NbMaxWordsFilter.pl 40 |uniq > $EXT_LEX/tmp/txt/$oov.bdlex_txt
405 if [ -f $EXT_LEX/tmp/txt/$oov.bdlex_txt ]; then 407 if [ -f $EXT_LEX/tmp/txt/$oov.bdlex_txt ]; then
406 nbWords=`wc -l $EXT_LEX/tmp/txt/$oov.bdlex_txt | cut -f1 -d" "` 408 nbWords=`wc -l $EXT_LEX/tmp/txt/$oov.bdlex_txt | cut -f1 -d" "`
407 if [ $nbWords -eq 0 ]; then 409 if [ $nbWords -eq 0 ]; then
408 print_warn "[${BASENAME}] UNVALID OOV: $oov => $nbWords occurrences" 2 410 print_warn "[${BASENAME}] UNVALID OOV: $oov => $nbWords occurrences" 2
409 echo "$oov" >> $EXT_LEX/final/${lexname}.unvalid_oov 411 echo "$oov" >> $EXT_LEX/final/${lexname}.unvalid_oov
410 else 412 else
411 # 413 #
412 # Find a candidate in a filtred invoc lexicon => a candidate which maximize the ppl in the overall txt collected 414 # Find a candidate in a filtred invoc lexicon => a candidate which maximize the ppl in the overall txt collected
413 # 415 #
414 #echo "$/getCandidate $SPEER_LM_PATH $SPEER_LM_BASENAME $oov $LEXICON.bdlex_tok $EXT_LEX/tmp/txt/$oov.bdlex_txt" 416 #echo "$/getCandidate $SPEER_LM_PATH $SPEER_LM_BASENAME $oov $LEXICON.bdlex_tok $EXT_LEX/tmp/txt/$oov.bdlex_txt"
415 print_info `$SPEERAL_PATH/bin/getCandidate $SPEER_LM_PATH $SPEER_LM_BASENAME $oov $CANDIDATE_LEXICON $EXT_LEX/tmp/txt/$oov.bdlex_txt | cut -f1 -d" "` 3 417 print_info `$SPEERAL_PATH/bin/getCandidate $SPEER_LM_PATH $SPEER_LM_BASENAME $oov $CANDIDATE_LEXICON $EXT_LEX/tmp/txt/$oov.bdlex_txt | cut -f1 -d" "` 3
416 candidate=`$SPEERAL_PATH/bin/getCandidate $SPEER_LM_PATH $SPEER_LM_BASENAME $oov $CANDIDATE_LEXICON $EXT_LEX/tmp/txt/$oov.bdlex_txt | cut -f1 -d" "` 418 candidate=`$SPEERAL_PATH/bin/getCandidate $SPEER_LM_PATH $SPEER_LM_BASENAME $oov $CANDIDATE_LEXICON $EXT_LEX/tmp/txt/$oov.bdlex_txt | cut -f1 -d" "`
417 if [ ! "$candidate" == "" ]; then 419 if [ ! "$candidate" == "" ]; then
418 grep --perl-regex "^$oov\t" $EXT_LEX/final/all.oov_acousticlyfound_phon > $EXT_LEX/tmp/$oov.phon 420 grep --perl-regex "^$oov\t" $EXT_LEX/final/all.oov_acousticlyfound_phon > $EXT_LEX/tmp/$oov.phon
419 while read phonLine 421 while read phonLine
420 do 422 do
421 #<word> <phon> => <word> <candidate> <phon> 423 #<word> <phon> => <word> <candidate> <phon>
422 echo "$phonLine" | sed "s|\t|\t$candidate\t|" >> $EXT_LEX/final/${lexname}_ext.map 424 echo "$phonLine" | sed "s|\t|\t$candidate\t|" >> $EXT_LEX/final/${lexname}_ext.map
423 done < $EXT_LEX/tmp/$oov.phon 425 done < $EXT_LEX/tmp/$oov.phon
424 else 426 else
425 print_warn "[${BASENAME}] UNVALID OOV: $oov => no availaible Candidate word in LM" 2 427 print_warn "[${BASENAME}] UNVALID OOV: $oov => no availaible Candidate word in LM" 2
426 echo "$oov" >> $EXT_LEX/final/${lexname}.unvalid_oov 428 echo "$oov" >> $EXT_LEX/final/${lexname}.unvalid_oov
427 fi 429 fi
428 fi 430 fi
429 else 431 else
430 print_warn "[${BASENAME}] UNVALID OOV: $oov" 2 432 print_warn "[${BASENAME}] UNVALID OOV: $oov" 2
431 echo "$oov" >> $EXT_LEX/final/${lexname}.unvalid_oov 433 echo "$oov" >> $EXT_LEX/final/${lexname}.unvalid_oov
432 fi 434 fi
433 done < $EXT_LEX/final/all.oov_acousticlyfound 435 done < $EXT_LEX/final/all.oov_acousticlyfound
434 436
#
### Speeral
#

lexname=$(basename "$LEXICON")
#
# Build the final trigger file: strip from every .trigg file the lines
# containing OOV words that were rejected during map construction.
#
print_info "[${BASENAME}] Clean trigg files" 3
mkdir -p "$TRIGGER_CONFZONE/speeral/" 2> /dev/null
mkdir -p "$EXT_LEX/speeral/" 2> /dev/null
# Iterate with a glob instead of parsing `ls` output (safe with spaces,
# no word-splitting of filenames).
for i in "$TRIGGER_CONFZONE"/*.trigg
do
    [ -e "$i" ] || continue   # unmatched glob => no .trigg files; skip
    trigg_base=$(basename "$i" .trigg)
    # FIX: the reject list is written to $EXT_LEX/final/ by the map
    # construction loop above; the old path ($EXT_LEX/$lexname.unvalid_oov)
    # pointed to a file that is never created there.
    "$SCRIPT_PATH/RemoveLineContaining.pl" "$EXT_LEX/final/${lexname}.unvalid_oov" < "$i" \
        > "$TRIGGER_CONFZONE/speeral/$trigg_base.trigg"
done
451 # 453 #
452 # Compile the speeral extended lexicon 454 # Compile the speeral extended lexicon
453 # 455 #
454 print_info "[${BASENAME}] Compile Speeral extended lexicon" 3 456 print_info "[${BASENAME}] Compile Speeral extended lexicon" 3
455 print_info "$SPEERAL_PATH/bin/buildmappedbinode $LEXICON.bdlex_phon $EXT_LEX/final/${lexname}_ext.map $AM_SKL $EXT_LEX/speeral/${lexname}_ext" 3 457 print_info "$SPEERAL_PATH/bin/buildmappedbinode $LEXICON.bdlex_phon $EXT_LEX/final/${lexname}_ext.map $AM_SKL $EXT_LEX/speeral/${lexname}_ext" 3
456 $SPEERAL_PATH/bin/buildmappedbinode $LEXICON.bdlex_phon $EXT_LEX/final/${lexname}_ext.map $AM_SKL $EXT_LEX/speeral/${lexname}_ext 458 $SPEERAL_PATH/bin/buildmappedbinode $LEXICON.bdlex_phon $EXT_LEX/final/${lexname}_ext.map $AM_SKL $EXT_LEX/speeral/${lexname}_ext
457 459
458 if [ $CHECK -eq 1 ] 460 if [ $CHECK -eq 1 ]
459 then 461 then
460 check_exploitconfpass_lex_check "${EXT_LEX}/speeral/${lexname}_ext" 462 check_exploitconfpass_lex_check "${EXT_LEX}/speeral/${lexname}_ext"
461 if [ $? -eq 1 ] 463 if [ $? -eq 1 ]
462 then 464 then
463 print_error "[${BASENAME}] Building Speeral Lexicon $INPUT_DIR -> exit" 465 print_error "[${BASENAME}] Building Speeral Lexicon $INPUT_DIR -> exit"
464 print_error "[${BASENAME}] Check $ERRORFILE" 466 print_error "[${BASENAME}] Check $ERRORFILE"
465 print_log_file $ERRORFILE "ERROR : Building Speeral Lexicon $INPUT_DIR" 467 print_log_file $ERRORFILE "ERROR : Building Speeral Lexicon $INPUT_DIR"
466 print_log_file $ERRORFILE "ERROR : ${EXT_LEX}/speeral/${lexname}_ext Empty after buildmappedbinode ?" 468 print_log_file $ERRORFILE "ERROR : ${EXT_LEX}/speeral/${lexname}_ext Empty after buildmappedbinode ?"
467 exit 1; 469 exit 1;
468 fi 470 fi
469 fi 471 fi
470 472
471 473
472 #-------# 474 #-------#
473 # CLOSE # 475 # CLOSE #
474 #-------# 476 #-------#
475 # Seem OK 477 # Seem OK
476 print_info "[${BASENAME}] <= ExploitConfidencePass End | $(date +'%d/%m/%y %H:%M:%S')" 1 478 print_info "[${BASENAME}] <= ExploitConfidencePass End | $(date +'%d/%m/%y %H:%M:%S')" 1
477 479
478 # unlok directory 480 # unlok directory
479 mv "$OUTPUT_DIR/EXPLOITCONFPASS.lock" "$OUTPUT_DIR/EXPLOITCONFPASS.unlock" 481 mv "$OUTPUT_DIR/EXPLOITCONFPASS.lock" "$OUTPUT_DIR/EXPLOITCONFPASS.unlock"
480 482
481 483
482 484