generate_example_kws.sh
2.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Guoguo Chen)
# Apache 2.0.
# Validate the command line: exactly two positional arguments are required.
if [[ $# -ne 2 ]]; then
  # Usage text is a diagnostic, so it is written to stderr.
  echo "Usage: local/generate_example_kws.sh <data-dir> <kws-data-dir>" >&2
  echo " e.g.: local/generate_example_kws.sh data/test_eval92/ data/kws" >&2
  exit 1
fi

# datadir:    directory holding the "text" file the keywords are drawn from.
# kwsdatadir: output directory; it is created below if it does not exist.
datadir=$1
kwsdatadir=$2
text=$datadir/text

# Fail early with a clear message rather than continuing without any input.
if [[ ! -f "$text" ]]; then
  echo "$0: expected file $text to exist" >&2
  exit 1
fi

# Quoted so that directory names containing spaces or glob characters work.
mkdir -p -- "$kwsdatadir" || exit 1
# Generate keywords; we generate 20 unigram keywords with at least 20 counts,
# 20 bigram keywords with at least 10 counts and 10 trigram keywords with at
# least 5 counts.
cat $text | perl -e '
%unigram = ();
%bigram = ();
%trigram = ();
while(<>) {
chomp;
@col=split(" ", $_);
shift @col;
for($i = 0; $i < @col; $i++) {
# unigram case
if (!defined($unigram{$col[$i]})) {
$unigram{$col[$i]} = 0;
}
$unigram{$col[$i]}++;
# bigram case
if ($i < @col-1) {
$word = $col[$i] . " " . $col[$i+1];
if (!defined($bigram{$word})) {
$bigram{$word} = 0;
}
$bigram{$word}++;
}
# trigram case
if ($i < @col-2) {
$word = $col[$i] . " " . $col[$i+1] . " " . $col[$i+2];
if (!defined($trigram{$word})) {
$trigram{$word} = 0;
}
$trigram{$word}++;
}
}
}
$max_count = 100;
$total = 20;
$current = 0;
$min_count = 20;
while ($current < $total && $min_count <= $max_count) {
foreach $x (keys %unigram) {
if ($unigram{$x} == $min_count) {
print "$x\n";
$unigram{$x} = 0;
$current++;
}
if ($current == $total) {
last;
}
}
$min_count++;
}
$total = 20;
$current = 0;
$min_count = 4;
while ($current < $total && $min_count <= $max_count) {
foreach $x (keys %bigram) {
if ($bigram{$x} == $min_count) {
print "$x\n";
$bigram{$x} = 0;
$current++;
}
if ($current == $total) {
last;
}
}
$min_count++;
}
$total = 10;
$current = 0;
$min_count = 3;
while ($current < $total && $min_count <= $max_count) {
foreach $x (keys %trigram) {
if ($trigram{$x} == $min_count) {
print "$x\n";
$trigram{$x} = 0;
$current++;
}
if ($current == $total) {
last;
}
}
$min_count++;
}
' > $kwsdatadir/raw_keywords.txt
echo "Keywords generation succeeded"