Blame view
tools/sctk-2.4.10/src/utf_filt/utf-1.2.dtd
9.59 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 |
<!SGML "ISO 8879:1986"
--
    File:    @(#)utf-1.2.dtd Mar 31, 2004

    Authors: Paul Morgovsky and Milan Young
             Linguistic Data Consortium, University of Pennsylvania.
             Henry S. Thompson,
             Language Technology Group
             University of Edinburgh
             Jon Fiscus
             Spoken Natural Language Processing Group
             NIST

    Desc:    SGML and DTD declaration for the new specifications for the
             Transcription of Spoken Language.  Numerous changes were made
             to enable named entity tagging and ASR tagging to co-exist.

             This dtd is also annotated with comments, which when run
             through the appropriate PERL script, will result in a DTD
             without active shortrefs.

    Revision History:
             - nothing yet
             - 11/02/99 Added the NOMEX tag. JGF
             - 03/31/04 Dave Graff and Jon Fiscus conspired to update the
               DTD for Arabic

    Usage:   nsgmls utf.dtd filename
--

CHARSET
    BASESET  "ISO 646-1983//CHARSET International Reference Version (IRV)//ESC 2/5 4/0"
    DESCSET    0   9 UNUSED   -- NUL,SOH,STX,ETX,EOT,ENQ,ACK,BEL,BS --
               9   2   9
              11   2 UNUSED   -- VT,FF --
              13   1  13
              14  18 UNUSED   -- SO,SI,DLE,DC1,DC2 --
              32  95  32
             127   1 UNUSED   -- del character --
    BASESET  "ISO 646-1983//CHARSET International Reference Version (IRV)//ESC 2/5 4/0"
    DESCSET  128    32 UNUSED
             160     1 UNUSED
             161 65373 161

CAPACITY PUBLIC "ISO 8879:1986//CAPACITY Reference//EN"

SCOPE DOCUMENT

SYNTAX
    SHUNCHAR CONTROLS 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
                      20 21 22 23 24 25 26 27 28 29 30 31 127 160
    BASESET  "ISO 646-1983//CHARSET International Reference Version (IRV)//ESC 2/5 4/0"
    DESCSET  0 128 0
    FUNCTION RE      13
             RS      10
             SPACE   32
             TAB SEPCHAR 9
    NAMING   LCNMSTRT ""
             UCNMSTRT ""
             LCNMCHAR "_-."
             UCNMCHAR "_-."
             NAMECASE GENERAL YES
                      ENTITY  NO
    DELIM    GENERAL  SGMLREF
             SHORTREF NONE
                      "*" "+" "^" "%" "@" "," "." "?"
                      "{" "_" "["
                      "&#RE;" "&#RE;&#RS;" "&#RE;&#RS;B" "&#RE;B" "&#RE;B&#RS;"
                      "&#RS;" "&#RS;&#RE;" "&#RS;&#RE;B" "&#RS;B" "&#RS;B&#RE;"
                      "B" "B&#RE;" "B&#RE;&#RS;" "B&#RS;" "B&#RS;&#RE;"
    NAMES    SGMLREF
    QUANTITY SGMLREF
             NAMELEN 99999999
             PILEN   24000
             TAGLEN  99999999
             TAGLVL  99999999

FEATURES
    MINIMIZE DATATAG  NO
             OMITTAG  YES
             RANK     YES
             SHORTTAG YES
    LINK     SIMPLE   YES 1000
             IMPLICIT YES
             EXPLICIT YES 1
    OTHER    CONCUR   NO
             SUBDOC   YES 99999999
             FORMAL   YES

APPINFO NONE>

<!--
    This dtd has been augmented with comments, which, after applying the
    following filter will disable shortrefs in the DTD.  Thus, text tokens
    are not parsed by sgml but are instead left to the application.

    Perl's -p switch applies the filter one input line at a time, so every
    marker comment in this file must stay on a line of its own, and no
    other comment may be placed inside a marked region.

    perl -pe 'if (/DELETE TO DISABLE SHORTREF/) { ($_ = " ") }
        elsif (/BEGIN COMMENT TO/) { ($_ = "< ! - - ")=~s/ //g; }
        elsif (/END COMMENT TO/) { ($_ = "- -> ")=~s/ //g; } '
-->

<!DOCTYPE utf [

<!-- Quick Substitution Entities -->

<!-- Two alternative definitions of the textTokens parameter entity follow.
     The first is active as shipped; the second is inside a comment and is
     the one left active after the filter described above has been run. -->

<!-- BEGIN COMMENT TO DISABLE SHORTREF -->
<!ENTITY % textTokens "(separator | pName | mispronounced | misspelling | acronym | idiosyncratic | nonlexeme | nonSpeech | period | qmark | comma | contraction | fragment | hyphen | acousticnoise | wtime | #PCDATA )" >
<!-- END COMMENT TO DISABLE SHORTREF -->
<!-- DELETE TO DISABLE SHORTREF
<!ENTITY % textTokens "(#PCDATA | contraction | fragment | hyphen | wtime )" >
DELETE TO DISABLE SHORTREF -->

<!-- Groups of the b_* / e_* elements, referenced from the content model
     of the turn element. -->
<!ENTITY % ne_bound "( b_enamex | e_enamex | b_timex | e_timex | b_numex | e_numex | b_nomex | e_nomex )" >
<!ENTITY % asr_bound "( b_foreign | e_foreign | b_unclear | e_unclear | b_overlap | e_overlap | b_noscore | e_noscore | b_aside | e_aside )" >

<!-- General entities named by the short reference map at the end of this
     DTD.  Each one expands to the start-tag of an EMPTY element, except
     IGNORE, which expands to nothing. -->
<!ENTITY NONSPEECH     "<nonSpeech>" >
<!ENTITY ACOUSTICNOISE "<acousticnoise>" >
<!ENTITY SEP           "<separator>" >
<!ENTITY PNAME         "<pName>" >
<!ENTITY MISPRONOUNCED "<mispronounced>" >
<!ENTITY MISSPELLING   "<misspelling>" >
<!ENTITY ACRONYM       "<acronym>" >
<!ENTITY IDIOSYNCRATIC "<idiosyncratic>" >
<!ENTITY NONLEXEME     "<nonlexeme>" >
<!ENTITY PERIOD        "<period>" >
<!ENTITY QMARK         "<qmark>" >
<!ENTITY COMMA         "<comma>" >
<!ENTITY IGNORE        "">

<!-- Document Grammar Specifications -->

<!-- Structural definition -->
<!ELEMENT utf                - - ( bn_episode_trans | conversation_trans ) >
<!ELEMENT bn_episode_trans   - - (section | recording_change | background)+ >
<!ELEMENT section            - - (turn | background)* >
<!ELEMENT conversation_trans - - (turn | background)* >
<!ELEMENT recording_change   - O EMPTY >
<!ELEMENT turn               - - ( %textTokens; | time | background | %ne_bound; | %asr_bound; )+ >
<!ELEMENT separator          - O EMPTY >

<!-- Floating elements -->
<!ELEMENT background - O EMPTY >
<!ELEMENT time       - O EMPTY >
<!ELEMENT wtime      - O EMPTY >

<!-- Bounding tags made explicit -->
<!ELEMENT b_foreign - O EMPTY >
<!ELEMENT b_unclear - O EMPTY >
<!ELEMENT b_overlap - O EMPTY >
<!ELEMENT b_noscore - O EMPTY >
<!ELEMENT b_aside   - O EMPTY >
<!ELEMENT e_foreign - O EMPTY >
<!ELEMENT e_unclear - O EMPTY >
<!ELEMENT e_overlap - O EMPTY >
<!ELEMENT e_noscore - O EMPTY >
<!ELEMENT e_aside   - O EMPTY >

<!ELEMENT b_enamex  - O EMPTY >
<!ELEMENT b_timex   - O EMPTY >
<!ELEMENT b_numex   - O EMPTY >
<!ELEMENT b_nomex   - O EMPTY >
<!ELEMENT e_enamex  - O EMPTY >
<!ELEMENT e_timex   - O EMPTY >
<!ELEMENT e_numex   - O EMPTY >
<!ELEMENT e_nomex   - O EMPTY >

<!-- Applied word tags -->
<!ELEMENT fragment    - O EMPTY >
<!ELEMENT contraction - O EMPTY >

<!-- Shortref elements -->
<!ELEMENT pName         - O EMPTY >
<!ELEMENT mispronounced - O EMPTY >
<!ELEMENT misspelling   - O EMPTY >
<!ELEMENT acronym       - O EMPTY >
<!ELEMENT idiosyncratic - O EMPTY >
<!ELEMENT nonlexeme     - O EMPTY >
<!ELEMENT nonSpeech     - O EMPTY >
<!ELEMENT acousticnoise - O EMPTY >
<!ELEMENT period        - O EMPTY >
<!ELEMENT qmark         - O EMPTY >
<!ELEMENT comma         - O EMPTY >
<!ELEMENT hyphen        - O EMPTY >

<!-- Attributes of the Tags -->
<!ATTLIST utf
          dtd_version    (utf-1.0|utf-1.1|utf-1.2) #REQUIRED
          audio_filename CDATA                     #REQUIRED
          language       CDATA                     #REQUIRED
          scribe         CDATA                     #IMPLIED
          version        NUMBER                    #IMPLIED
          version_date   CDATA                     #IMPLIED>

<!ATTLIST bn_episode_trans
          program  CDATA #REQUIRED
          air_date CDATA #IMPLIED >

<!ATTLIST conversation_trans
          recording_date CDATA #IMPLIED >

<!ATTLIST section
          type      (report|filler|nontrans) #REQUIRED
          startTime CDATA                    #REQUIRED
          endTime   CDATA                    #REQUIRED
          id        CDATA                    #IMPLIED
          topic     CDATA                    #IMPLIED >

<!ATTLIST recording_change
          show CDATA #REQUIRED
          date CDATA #REQUIRED
          sec  CDATA #REQUIRED >

<!ATTLIST turn
          speaker   CDATA                       #REQUIRED
          spkrtype  (male|female|child|unknown) #REQUIRED
          dialect   CDATA                       #IMPLIED
          startTime CDATA                       #REQUIRED
          endTime   CDATA                       #REQUIRED
          mode      (planned|spontaneous)       #IMPLIED
          channel   CDATA                       #IMPLIED
          fidelity  (low|medium|high)           #IMPLIED >

<!-- NOTE(review): the default value declared for "reason" below is the
     literal token CDATA rather than a keyword such as #IMPLIED or
     #REQUIRED.  This looks like a typo, but it is left unchanged here
     because changing it alters what the parser reports for existing
     transcripts; confirm the intended default before editing. -->
<!ATTLIST b_noscore
          startTime CDATA #REQUIRED
          endTime   CDATA #REQUIRED
          reason    CDATA CDATA >

<!ATTLIST b_foreign
          language CDATA #REQUIRED >

<!ATTLIST contraction
          e_form CDATA #REQUIRED >

<!ATTLIST b_overlap
          startTime CDATA #IMPLIED
          endTime   CDATA #IMPLIED >

<!ATTLIST time
          sec CDATA #REQUIRED >

<!ATTLIST wtime
          startTime CDATA #REQUIRED
          endTime   CDATA #REQUIRED
          clust     CDATA #IMPLIED
          conf      CDATA #IMPLIED >

<!ATTLIST background
          startTime CDATA                #REQUIRED
          type      (music|speech|other) #REQUIRED
          level     (off|low|high)       #REQUIRED >

<!ATTLIST b_enamex
          type   CDATA #REQUIRED
          status (opt) #IMPLIED
          alt    CDATA #IMPLIED >

<!ATTLIST b_timex
          type   CDATA #REQUIRED
          status (opt) #IMPLIED
          alt    CDATA #IMPLIED >

<!ATTLIST b_numex
          type   CDATA #REQUIRED
          status (opt) #IMPLIED
          alt    CDATA #IMPLIED >

<!ATTLIST b_nomex
          type   CDATA #REQUIRED
          status (opt) #IMPLIED
          min    CDATA #IMPLIED >

<!-- Short Reference Mappings -->

<!-- BEGIN COMMENT TO DISABLE SHORTREF -->
<!SHORTREF TURN
           '.'               PERIOD
           '?'               QMARK
           ','               COMMA
           '+'               MISPRONOUNCED
           '@'               MISSPELLING
           '_'               ACRONYM
           '^'               PNAME
           '*'               IDIOSYNCRATIC
           '%'               NONLEXEME
           '{'               NONSPEECH
           '['               ACOUSTICNOISE
           '&#RS;B&#RE;'     IGNORE
           '&#RS;&#RE;'      IGNORE
           '&#RE;&#RS;'      SEP
           '&#RE;&#RS;B'     SEP
           '&#RE;'           SEP
           '&#RE;B&#RS;'     SEP
           '&#RE;B'          SEP
           '&#RS;&#RE;B'     SEP
           '&#RS;'           SEP
           '&#RS;B'          SEP
           'B&#RE;&#RS;'     SEP
           'B&#RE;'          SEP
           'B&#RS;&#RE;'     SEP
           'B&#RS;'          SEP
           'B'               SEP >
<!USEMAP TURN turn >
<!-- END COMMENT TO DISABLE SHORTREF -->

]>