utf-1.2.dtd
9.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
<!SGML "ISO 8879:1986"
--
File: @(#)utf-1.2.dtd Mar 31, 2004
Authors: Paul Morgovsky and Milan Young
Linguistic Data Consortium,
University of Pennsylvania.
Henry S. Thompson,
Language Technology Group
University of Edinburgh
Jon Fiscus
Spoken Natural Language Processing Group
NIST
Desc: SGML and DTD declaration for the new specifications for the
Transcription of Spoken Language.
Numerous changes were made to enable named entity tagging
and ASR tagging to co-exist. This dtd is also annotated
with comments, which when ran through the appropriate PERL
script, will result is a DTD without active shortrefs.
Revision History:
- nothing yet
- 11/02/99 Added the NOMEX tag. JGF
- 03/31/04 Dave Graff and Jon Fiscus conspired to update the DTD for Arabic
Usage:
nsgmls utf.dtd filename
--
CHARSET BASESET "ISO 646-1983//CHARSET
International Reference Version (IRV)//ESC 2/5 4/0"
DESCSET 0 9 UNUSED -- NUL,SOH,STX,ETX,ETO,ENQ,ACK,BEL,BS --
9 2 9
11 2 UNUSED -- VT,FF --
13 1 13
14 18 UNUSED -- SO,SI,DLE,DC1,DC2 --
32 95 32
127 1 UNUSED -- del character --
BASESET "ISO 646-1983//CHARSET
International Reference Version (IRV)//ESC 2/5 4/0"
DESCSET 128 32 UNUSED
160 1 UNUSED
161 65373 161
CAPACITY PUBLIC "ISO 8879:1986//CAPACITY Reference//EN"
SCOPE DOCUMENT
SYNTAX SHUNCHAR CONTROLS 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
18 19 20 21 22 23 24 25 26 27 28 29 30 31 127 160
BASESET "ISO 646-1983//CHARSET International Reference
Version (IRV)//ESC 2/5 4/0"
DESCSET 0 128 0
FUNCTION RE 13
RS 10
SPACE 32
TAB SEPCHAR 9
NAMING LCNMSTRT ""
UCNMSTRT ""
LCNMCHAR "_-."
UCNMCHAR "_-."
NAMECASE GENERAL YES
ENTITY NO
DELIM GENERAL SGMLREF
SHORTREF NONE "*" "+" "^" "%" "@" "," "." "?" "{" "_" "["
"&#RE;"
"&#RE;&#RS;"
"&#RE;&#RS;B"
"&#RE;B"
"&#RE;B&#RS;"
"&#RS;"
"&#RS;&#RE;"
"&#RS;&#RE;B"
"&#RS;B"
"&#RS;B&#RE;"
"B"
"B&#RE;"
"B&#RE;&#RS;"
"B&#RS;"
"B&#RS;&#RE;"
NAMES SGMLREF
QUANTITY SGMLREF
NAMELEN 99999999
PILEN 24000
TAGLEN 99999999
TAGLVL 99999999
FEATURES MINIMIZE DATATAG NO
OMITTAG YES
RANK YES
SHORTTAG YES
LINK SIMPLE YES 1000
IMPLICIT YES
EXPLICIT YES 1
OTHER CONCUR NO
SUBDOC YES 99999999
FORMAL YES
APPINFO NONE>
<!-- This dtd has bee augmented with comments, which, after applying the following
filter will dissable shortrefs in the DTD. Thus, text tokens are not parsed by
sgml but is instead left to the application.
perl -pe 'if (/DELETE TO DISABLE SHORTREF/) { ($_ = "\n") } elsif (/BEGIN COMMENT TO/) { ($_ = "< ! - -\n")=~s/ //g; } elsif (/END COMMENT TO/) { ($_ = "- ->\n")=~s/ //g; } '
-->
<!DOCTYPE utf [
<!-- Quick Substitution Entities -->
<!-- BEGIN COMMENT TO DISABLE SHORTREF -->
<!ENTITY % textTokens "(separator | pName | mispronounced | misspelling | acronym | idiosyncratic | nonlexeme | nonSpeech | period | qmark | comma | contraction | fragment | hyphen | acousticnoise | wtime | #PCDATA )" >
<!-- END COMMENT TO DISABLE SHORTREF -->
<!-- DELETE TO DISABLE SHORTREF
<!ENTITY % textTokens "(#PCDATA | contraction | fragment | hyphen | wtime )" >
DELETE TO DISABLE SHORTREF -->
<!ENTITY % ne_bound "( b_enamex | e_enamex | b_timex | e_timex | b_numex | e_numex | b_nomex | e_nomex )" >
<!ENTITY % asr_bound "( b_foreign | e_foreign | b_unclear | e_unclear | b_overlap | e_overlap | b_noscore | e_noscore | b_aside | e_aside )" >
<!ENTITY NONSPEECH "<nonSpeech>" >
<!ENTITY ACOUSTICNOISE "<acousticnoise>" >
<!ENTITY SEP "<separator>" >
<!ENTITY PNAME "<pName>" >
<!ENTITY MISPRONOUNCED "<mispronounced>" >
<!ENTITY MISSPELLING "<misspelling>" >
<!ENTITY ACRONYM "<acronym>" >
<!ENTITY IDIOSYNCRATIC "<idiosyncratic>" >
<!ENTITY NONLEXEME "<nonlexeme>" >
<!ENTITY PERIOD "<period>" >
<!ENTITY QMARK "<qmark>" >
<!ENTITY COMMA "<comma>" >
<!ENTITY IGNORE "">
<!-- Document Grammar Specifications -->
<!-- Structural definition -->
<!ELEMENT utf - - ( bn_episode_trans | conversation_trans ) >
<!ELEMENT bn_episode_trans
- - (section | recording_change | background)+ >
<!ELEMENT section - - (turn | background)* >
<!ELEMENT conversation_trans - - (turn | background)* >
<!ELEMENT recording_change - O EMPTY >
<!ELEMENT turn - - ( %textTokens; | time | background | %ne_bound; | %asr_bound; )+ >
<!ELEMENT separator - O EMPTY >
<!-- Floating elements -->
<!ELEMENT background - O EMPTY >
<!ELEMENT time - O EMPTY >
<!ELEMENT wtime - O EMPTY >
<!-- Bouunding tags made explicitly -->
<!ELEMENT b_foreign - O EMPTY >
<!ELEMENT b_unclear - O EMPTY >
<!ELEMENT b_overlap - O EMPTY >
<!ELEMENT b_noscore - O EMPTY >
<!ELEMENT b_aside - O EMPTY >
<!ELEMENT e_foreign - O EMPTY >
<!ELEMENT e_unclear - O EMPTY >
<!ELEMENT e_overlap - O EMPTY >
<!ELEMENT e_noscore - O EMPTY >
<!ELEMENT e_aside - O EMPTY >
<!ELEMENT b_enamex - O EMPTY >
<!ELEMENT b_timex - O EMPTY >
<!ELEMENT b_numex - O EMPTY >
<!ELEMENT b_nomex - O EMPTY >
<!ELEMENT e_enamex - O EMPTY >
<!ELEMENT e_timex - O EMPTY >
<!ELEMENT e_numex - O EMPTY >
<!ELEMENT e_nomex - O EMPTY >
<!-- Applied word tags -->
<!ELEMENT fragment - O EMPTY >
<!ELEMENT contraction - O EMPTY >
<!-- Shortref elements -->
<!ELEMENT pName - O EMPTY >
<!ELEMENT mispronounced - O EMPTY >
<!ELEMENT misspelling - O EMPTY >
<!ELEMENT acronym - O EMPTY >
<!ELEMENT idiosyncratic - O EMPTY >
<!ELEMENT nonlexeme - O EMPTY >
<!ELEMENT nonSpeech - O EMPTY >
<!ELEMENT acousticnoise - O EMPTY >
<!ELEMENT period - O EMPTY >
<!ELEMENT qmark - O EMPTY >
<!ELEMENT comma - O EMPTY >
<!ELEMENT hyphen - O EMPTY >
<!-- Attributes of the Tags -->
<!ATTLIST utf dtd_version (utf-1.0|utf-1.1|utf-1.2) #REQUIRED
audio_filename CDATA #REQUIRED
language CDATA #REQUIRED
scribe CDATA #IMPLIED
version NUMBER #IMPLIED
version_date CDATA #IMPLIED>
<!ATTLIST bn_episode_trans
program CDATA #REQUIRED
air_date CDATA #IMPLIED >
<!ATTLIST conversation_trans
recording_date CDATA #IMPLIED >
<!ATTLIST section type (report|filler|nontrans) #REQUIRED
startTime CDATA #REQUIRED
endTime CDATA #REQUIRED
id CDATA #IMPLIED
topic CDATA #IMPLIED >
<!ATTLIST recording_change show CDATA #REQUIRED
date CDATA #REQUIRED
sec CDATA #REQUIRED >
<!ATTLIST turn speaker CDATA #REQUIRED
spkrtype (male|female|child|unknown) #REQUIRED
dialect CDATA #IMPLIED
startTime CDATA #REQUIRED
endTime CDATA #REQUIRED
mode (planned|spontaneous) #IMPLIED
channel CDATA #IMPLIED
fidelity (low|medium|high) #IMPLIED >
<!ATTLIST b_noscore startTime CDATA #REQUIRED
endTime CDATA #REQUIRED
reason CDATA CDATA >
<!ATTLIST b_foreign language CDATA #REQUIRED >
<!ATTLIST contraction e_form CDATA #REQUIRED >
<!ATTLIST b_overlap startTime CDATA #IMPLIED
endTime CDATA #IMPLIED >
<!ATTLIST time sec CDATA #REQUIRED >
<!ATTLIST wtime startTime CDATA #REQUIRED
endTime CDATA #REQUIRED
clust CDATA #IMPLIED
conf CDATA #IMPLIED >
<!ATTLIST background startTime CDATA #REQUIRED
type (music|speech|other) #REQUIRED
level (off|low|high) #REQUIRED >
<!ATTLIST b_enamex type CDATA #REQUIRED
status (opt) #IMPLIED
alt CDATA #IMPLIED >
<!ATTLIST b_timex type CDATA #REQUIRED
status (opt) #IMPLIED
alt CDATA #IMPLIED >
<!ATTLIST b_numex type CDATA #REQUIRED
status (opt) #IMPLIED
alt CDATA #IMPLIED >
<!ATTLIST b_nomex type CDATA #REQUIRED
status (opt) #IMPLIED
min CDATA #IMPLIED >
<!-- Short Refference Mappings -->
<!-- BEGIN COMMENT TO DISABLE SHORTREF -->
<!SHORTREF TURN '.' PERIOD
'?' QMARK
',' COMMA
'+' MISPRONOUNCED
'@' MISSPELLING
'_' ACRONYM
'^' PNAME
'*' IDIOSYNCRATIC
'%' NONLEXEME
'{' NONSPEECH
'[' ACOUSTICNOISE
'&#RS;B&#RE;' IGNORE
'&#RS;&#RE;' IGNORE
'&#RE;&#RS;' SEP
'&#RE;&#RS;B' SEP
'&#RE;' SEP
'&#RE;B&#RS;' SEP
'&#RE;B' SEP
'&#RS;&#RE;B' SEP
'&#RS;' SEP
'&#RS;B' SEP
'B&#RE;&#RS;' SEP
'B&#RE;' SEP
'B&#RS;&#RE;' SEP
'B&#RS;' SEP
'B' SEP >
<!USEMAP TURN turn >
<!-- END COMMENT TO DISABLE SHORTREF -->
]>