/* from_token2ctm_ne.c */
/* From token and ne files to an STM one */
/* FRED 0309 - MODIF GERALDINE 0710 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <libxml/parser.h>
#include <libxml/tree.h>

#ifdef LIBXML_TREE_ENABLED

/* ................................................................ */

/* Size, in bytes, of the fixed line and text buffers declared in this file. */
#define TailleLigne     80000

/* Integer boolean constants. */
#define True    1
#define False   0

/*
 * Report a fatal error on stderr and terminate the program.
 *
 * ch1, ch2: the two message fragments, printed as "ERREUR : <ch1> <ch2>".
 *
 * Never returns. The process exits with EXIT_FAILURE so that a calling
 * script can tell a failed run from a successful one (an exit status of 0
 * would be indistinguishable from success).
 */
void 
ERREUR(char *ch1, char *ch2)
{
	fprintf(stderr, "ERREUR : %s %s\n", ch1, ch2);
	exit(EXIT_FAILURE);
}

/* ................................................................ */

/*
format CTM-NE

20041006_0700_0800_CLASSIQUE 1 1.98 0.19 bonne
20041006_0700_0800_CLASSIQUE 1 2.17 0.54 journée
20041006_0700_0800_CLASSIQUE 1 2.71 0.19 bon
20041006_0700_0800_CLASSIQUE 1 2.90 0.82 courage
20041006_0700_0800_CLASSIQUE 1 3.72 0.27 pour
20041006_0700_0800_CLASSIQUE 1 3.99 0.20 ce
20041006_0700_0800_CLASSIQUE 1 4.19 0.46 mercredi--time--1
20041006_0700_0800_CLASSIQUE 1 4.65 0.25 six--time--1
20041006_0700_0800_CLASSIQUE 1 4.90 0.44 octobre--time--1
20041006_0700_0800_CLASSIQUE 1 5.34 0.31 dans
20041006_0700_0800_CLASSIQUE 1 5.65 0.15 une
20041006_0700_0800_CLASSIQUE 1 5.80 0.33 petite
20041006_0700_0800_CLASSIQUE 1 6.13 0.48 minute
20041006_0700_0800_CLASSIQUE 1 6.61 0.17 le
20041006_0700_0800_CLASSIQUE 1 6.78 0.34 journal
20041006_0700_0800_CLASSIQUE 1 7.12 0.10 de
20041006_0700_0800_CLASSIQUE 1 7.22 0.05 l'
20041006_0700_0800_CLASSIQUE 1 7.27 0.42 économie
20041006_0700_0800_CLASSIQUE 1 7.69 0.42 deuxième
20041006_0700_0800_CLASSIQUE 1 8.11 0.43 édition
20041006_0700_0800_CLASSIQUE 1 8.54 0.10 mais
20041006_0700_0800_CLASSIQUE 1 8.64 0.47 tout_de_suite
20041006_0700_0800_CLASSIQUE 1 9.11 0.17 les
20041006_0700_0800_CLASSIQUE 1 9.28 0.19 grands
20041006_0700_0800_CLASSIQUE 1 9.47 0.31 titres
20041006_0700_0800_CLASSIQUE 1 9.78 0.11 de
20041006_0700_0800_CLASSIQUE 1 9.89 0.05 l'
20041006_0700_0800_CLASSIQUE 1 9.94 0.62 actualité
20041006_0700_0800_CLASSIQUE 1 10.56 0.27 mode
20041006_0700_0800_CLASSIQUE 1 10.83 0.29 bayeux--loc--2
20041006_0700_0800_CLASSIQUE 1 11.12 0.55 bonjour
20041006_0700_0800_CLASSIQUE 1 11.75 0.37 bonjour

format token

<?xml version="1.0" encoding="UTF-8"?>
<Token audio_filename="XXXX" type="MANUAL" asr="NONE" version_date="march2009">
<sentence id="s000001">
<text>le dix huit vingt continue avec le journal de Mickaël Thébault . bonsoir Mickaël . </text>
<tokens count="17">
        <token id="s000001_t0001"  type="sgmltag" content="20070711_1900_1920_inter 1 excluded_region 0.000 81.085 [o,,unknown] ignore_time_segment_in_scoring"/>
        <token id="s000001_t0002"  type="sgmltag" content="20070711_1900_1920_inter 1 20070711_1900_1920_inter_speaker_20 81.085 84.582 [o,fx,male]"/>
        <token type="wtoken" id="s000001_t0003">le</token>
        <token type="wtoken" id="s000001_t0004">dix</token>
        <token type="wtoken" id="s000001_t0005">huit</token>
        <token type="wtoken" id="s000001_t0006">vingt</token>
        <token type="wtoken" id="s000001_t0007">continue</token>
        <token type="wtoken" id="s000001_t0008">avec</token>
        <token type="wtoken" id="s000001_t0009">le</token>
        <token type="wtoken" id="s000001_t0010">journal</token>
        <token type="wtoken" id="s000001_t0011">de</token>
        <token type="wtoken" id="s000001_t0012">Mickaël</token>
        <token type="wtoken" id="s000001_t0013">Thébault</token>
        <token type="wtoken" id="s000001_t0014">.</token>
        <token type="wtoken" id="s000001_t0015">bonsoir</token>
        <token type="wtoken" id="s000001_t0016">Mickaël</token>
        <token type="wtoken" id="s000001_t0017">.</token>
</tokens>
</sentence>

format ne

<NE ne_tagger="LIA_NE" type="AUTO" audio_filename="XXXX" version="01" date="????" token_filename="./dev_v3.4/stm_dev_v3.4/20070711_1900_1920_inter.tk2">
<ne id="s000002_ne0001" token="s000002_t0016 s000002_t0017 s000002_t0018 s000002_t0019 s000002_t0020" cat="LOC"> Mickaël Thébault . bonsoir Mickaël </ne>
<ne id="s000005_ne0001" token="s000005_t0016" cat="ORG"> PS </ne>
<ne id="s000006_ne0001" token="s000006_t0007 s000006_t0008" cat="PERS"> Jack Lang </ne>
<ne id="s000007_ne0001" token="s000007_t0011" cat="ORG"> Pierrot </ne>
<ne id="s000007_ne0002" token="s000007_t0015 s000007_t0016" cat="PERS"> Pierre Bodein </ne>
<ne id="s000007_ne0003" token="s000007_t0023 s000007_t0024" cat="AMOUNT"> trente ans </ne>
<ne id="s000008_ne0001" token="s000008_t0012 s000008_t0013 s000008_t0014 s000008_t0015 s000008_t0016" cat="LOC"> cour d' assises du Bas_Rhin </ne>
<ne id="s000009_ne0001" token="s000009_t0011" cat="ORG"> Air_France </ne>
<ne id="s000011_ne0001" token="s000011_t0008" cat="LOC"> Pakistan </ne>

format stm_ne

20030418_0800_0900_FRANCEINTER_DGA 1 Patrick_Roger 12.793 16.698 <o,f3,male> les ministres [pers.hum François Fillon ] et [pers.hum Jean-Paul Delevoye ] dévo

*/

/* ................................................................ */

/*
 * Non-zero when character a lies between 'A' and 'Z' inclusive.
 * The argument is evaluated twice, so it must have no side effects.
 */
#define IF_MAJUSCULE(a)	(((a)>='A')&&((a)<='Z'))

/*
 * One named-entity boundary record: a token id paired with the category of
 * the entity that starts or ends on that token.
 */
typedef struct {
	/* token id string; NULL marks the end of a table */
	char           *token;
	/* category name as stored by load_ne() ('A'..'Z' lower-cased) */
	char           *cate;
}               type_ne;

/* Capacity of the two entity tables, terminator entry included. */
#define MAX_NE	10000

/*
 * Parallel tables filled by load_ne(): entry k of T_begin_ne holds the first
 * token id of the k-th entity read, entry k of T_end_ne its last token id.
 * Each table is terminated by an entry whose token field is NULL.
 */
type_ne         T_begin_ne[MAX_NE];
type_ne         T_end_ne[MAX_NE];

/*
 * Load the named-entity file into the global tables T_begin_ne and T_end_ne.
 *
 * Only lines that start with "<ne " are considered. On each such line the
 * token="..." attribute supplies a space-separated list of token ids and the
 * cat="..." attribute supplies the category. The first id is stored in
 * T_begin_ne and the last id in T_end_ne (the same id when the list has a
 * single element); the category, with 'A'..'Z' lower-cased, is stored in
 * both entries. Both tables end with an entry whose token is NULL.
 *
 * chfilene: path of the named-entity file.
 *
 * Every problem (unreadable file, malformed line, over-long category, too
 * many entities, allocation failure) is fatal and reported through ERREUR(),
 * which does not return.
 */
void 
load_ne(char *chfilene)
{
	FILE           *file;
	char            ch[TailleLigne], chcate[100], *chbegin, *chend;
	int             nb, i, j;

	if (!(file = fopen(chfilene, "rt")))
		ERREUR("can't open:", chfilene);
	for (nb = 0; fgets(ch, TailleLigne, file);) {
		if (strncmp(ch, "<ne ", 4))
			continue;
		chcate[0] = '\0';
		chbegin = chend = NULL;
		for (i = 0; ch[i]; i++)
			if (!strncmp(ch + i, "cat=", 4)) {
				/* do not step over the terminator of a truncated line */
				if (!ch[i + 4])
					ERREUR("bad format1 token2ctm:", ch);
				/* skip `cat="` and copy the lower-cased value */
				for (j = 0, i += 5; (ch[i]) && (ch[i] != '"'); i++, j++) {
					if (j >= (int) sizeof(chcate) - 1)
						ERREUR("category name too long:", ch);
					chcate[j] = IF_MAJUSCULE(ch[i]) ? ch[i] + ('a' - 'A') : ch[i];
				}
				if (!ch[i])
					ERREUR("bad format1 token2ctm:", ch);
				chcate[j] = '\0';
			} else if (!strncmp(ch + i, "token=", 6)) {
				/* do not step over the terminator of a truncated line */
				if (!ch[i + 6])
					ERREUR("bad format2:", ch);
				/* skip `token="`; the first id ends at a space or quote */
				chbegin = ch + i + 7;
				for (i += 7; (ch[i]) && (ch[i] != ' ') && (ch[i] != '"'); i++);
				if (!ch[i])
					ERREUR("bad format2:", ch);
				if (ch[i] == '"') {
					/* single id: it is both the first and the last */
					chend = chbegin;
					ch[i] = '\0';
				} else {
					/* cut after the first id, then remember the last space
					 * seen before the closing quote: the last id follows it */
					ch[i++] = '\0';
					for (chend = ch + i; (ch[i]) && (ch[i] != '"'); i++)
						if (ch[i] == ' ')
							chend = ch + i;
					if (!ch[i])
						ERREUR("bad format3:", ch);
					ch[i] = '\0';
					while ((*chend) && (*chend == ' '))
						chend++;
				}
			}
		if ((!chcate[0]) || (!chbegin) || (!chend))
			ERREUR("bad format4:", ch);

		T_begin_ne[nb].token = strdup(chbegin);
		T_begin_ne[nb].cate = strdup(chcate);
		T_end_ne[nb].token = strdup(chend);
		T_end_ne[nb].cate = strdup(chcate);
		if ((!T_begin_ne[nb].token) || (!T_begin_ne[nb].cate) ||
		    (!T_end_ne[nb].token) || (!T_end_ne[nb].cate))
			ERREUR("out of memory while loading:", chfilene);
		nb++;
		/* one slot must stay free for the NULL terminator */
		if (nb >= MAX_NE)
			ERREUR("cste MAX_NE too small", "");
	}
	T_begin_ne[nb].token = T_end_ne[nb].token = NULL;
	fclose(file);
}

/*
 * Look up a token id in an entity table terminated by a NULL token.
 *
 * id:   token id to search for.
 * tabl: table to scan (T_begin_ne or T_end_ne in this file).
 *
 * Returns the category of the first entry whose token equals id, or NULL
 * when no entry matches.
 */
char           *
find_cate(char *id, type_ne * tabl)
{
	type_ne        *entry;

	for (entry = tabl; entry->token != NULL; entry++) {
		if (strcmp(entry->token, id) == 0)
			return entry->cate;
	}
	return NULL;
}

/* ................................................................ */

/*
 * Append the text carried by node, then by its following siblings, then by
 * its children, to the NUL-terminated string ch.
 *
 * Before a node's content is appended, a single space is added when ch is
 * non-empty and does not already end with one. While copying, newlines are
 * dropped, and a space that directly follows another space in the source
 * content is dropped as well.
 *
 * ch:   destination string; the caller must provide enough room, no bound
 *       is checked here.
 * node: first node to visit; NULL is accepted and does nothing.
 */
void 
sprint_word(char *ch, xmlNode * node)
{
	const char     *src;
	size_t          len, k;

	if (node == NULL)
		return;

	src = (const char *) node->content;
	if (src != NULL) {
		len = strlen(ch);
		if ((len > 0) && (ch[len - 1] != ' '))
			ch[len++] = ' ';
		for (k = 0; src[k] != '\0'; k++) {
			if (src[k] == '\n')
				continue;
			/* the test looks at the previous source character, not at
			 * what was last written to ch */
			if ((k > 0) && (src[k] == ' ') && (src[k - 1] == ' '))
				continue;
			ch[len++] = src[k];
		}
		ch[len] = '\0';
	}
	sprint_word(ch, node->next);
	sprint_word(ch, node->children);
}

/*
 * Append the content of node to ch unchanged, unless that content is exactly
 * "\n", then continue with the siblings and the children of node.
 *
 * ch:   destination string; no bound is checked here.
 * node: starting node; NULL is accepted and does nothing.
 *
 * NOTE(review): the siblings and children are handed to sprint_word(), not
 * to sprint_word_raw(), so only the first node is copied verbatim. Confirm
 * whether that is intended.
 */
void 
sprint_word_raw(char *ch, xmlNode * node)
{
	const char     *raw;

	if (node == NULL)
		return;

	raw = (const char *) node->content;
	if ((raw != NULL) && (strcmp(raw, "\n") != 0))
		strcat(ch, raw);

	sprint_word(ch, node->next);
	sprint_word(ch, node->children);
}

/*
 * Return the value of the attribute called name.
 *
 * ptat: head of an attribute list (for instance node->properties).
 * name: attribute name to look for.
 *
 * The first attribute with that name is used. If there is none, or if it
 * carries no content, the error is reported through ERREUR(), which
 * terminates the program.
 */
char           *
find_attribute(xmlAttr * ptat, char *name)
{
	xmlAttr        *attr = ptat;

	while ((attr != NULL) && (strcmp((char *) attr->name, name) != 0))
		attr = attr->next;

	if ((attr == NULL) || (attr->children == NULL) || (attr->children->content == NULL))
		ERREUR("corpus without ", name);
	return (char *) attr->children->content;
}

/*
 * Tell whether the attribute list holds a usable attribute called name.
 *
 * ptat: head of an attribute list.
 * name: attribute name to look for.
 *
 * Returns 1 when the first attribute with that name has content, 0 when it
 * has none or when no attribute has that name.
 */
int
if_attribute(xmlAttr * ptat, char *name)
{
	for (; ptat != NULL; ptat = ptat->next) {
		if (strcmp((char *) (ptat->name), name) == 0)
			return (ptat->children != NULL) && (ptat->children->content != NULL);
	}
	return 0;
}


/*
 * Depth-first search for an element called name.
 *
 * a_node: first node of a sibling chain; NULL is accepted.
 * name:   element name to look for.
 *
 * Each sibling is tested first; when it does not match, its children are
 * searched before moving on to the next sibling. Returns the first matching
 * element, or NULL when there is none.
 */
xmlNode        *
find_node(xmlNode * a_node, char *name)
{
	xmlNode        *sibling, *found;

	for (sibling = a_node; sibling != NULL; sibling = sibling->next) {
		if ((sibling->type == XML_ELEMENT_NODE) &&
		    (strcmp((const char *) sibling->name, name) == 0))
			return sibling;

		found = find_node(sibling->children, name);
		if (found != NULL)
			return found;
	}
	return NULL;
}

/*
 * Starting at pt (included), walk the sibling chain and return the first
 * element named "token", or NULL when the chain holds none.
 */
xmlNode        *
next_node(xmlNode * pt)
{
	for (; pt != NULL; pt = pt->next) {
		if ((pt->type == XML_ELEMENT_NODE) &&
		    (strcmp((const char *) pt->name, "token") == 0))
			break;
	}
	return pt;
}

/*
 * Walk the tree rooted at a_node (the node, its following siblings and all
 * their descendants) and print every <sentence> element on stdout.
 *
 * For each "token" element found under the sentence's <tokens> child:
 *   - type "sgmltag": its "content" attribute is printed as is, and its
 *     optional "endline" attribute is remembered for the end of the line;
 *   - any other type: the token text is printed, preceded by " " when the
 *     previous token was an sgmltag and by "_" otherwise. When the token is
 *     the last one before the next sgmltag (or the last of the sentence),
 *     the line is closed with "--<category>--<count>" if an entity is open,
 *     then the remembered endline text, then a newline.
 * An entity opens on a token listed in T_begin_ne and closes after a token
 * listed in T_end_ne.
 *
 * a_node: first node to process; NULL is accepted.
 * nben:   counter incremented each time an entity opens; its value is what
 *         is printed as <count>.
 *
 * The open category, the endline text and the "previous token was an
 * sgmltag" flag are kept in static storage, so they carry over from one
 * sentence to the next and across the recursive calls.
 *
 * A sentence without <tokens>, or a token without the required attributes,
 * is fatal (reported through ERREUR / find_attribute).
 */
void 
process_token(xmlNode * a_node, int *nben)
{
	xmlNode        *cur_node, *pt, *pt2;
	char           *newcate;
	static char     ch[TailleLigne], *cate = NULL, chendline[TailleLigne];
	static int      prevsgml = False;

	for (cur_node = a_node; cur_node; cur_node = cur_node->next) {
		if ((cur_node->type == XML_ELEMENT_NODE) && (!strcmp((char *) cur_node->name, "sentence"))) {
			pt = find_node(cur_node->children, "tokens");
			if (!pt)
				ERREUR("bad format in xml: no 'tokens'", "");
			for (pt = pt->children; pt; pt = pt->next) {
				if ((pt->type != XML_ELEMENT_NODE) || (strcmp((char *) pt->name, "token")))
					continue;
				if (!strcmp(find_attribute(pt->properties, "type"), "sgmltag")) {
					chendline[0] = '\0';
					/* printed straight from the document: no copy into
					 * a fixed-size buffer is needed */
					printf("%s", find_attribute(pt->properties, "content"));

					/* bounded copy: the value is needed when a later
					 * token closes the line */
					if (if_attribute(pt->properties, "endline"))
						snprintf(chendline, sizeof chendline, "%s",
							 find_attribute(pt->properties, "endline"));
					prevsgml = True;
				} else {
					ch[0] = '\0';
					sprint_word(ch, pt->children);

					/* does an entity start on this token? */
					newcate = find_cate(find_attribute(pt->properties, "id"), T_begin_ne);
					if (newcate) {
						(*nben)++;
						cate = newcate;
					}
					if (prevsgml)
						printf(" ");
					else
						printf("_");

					printf("%s", ch);

					/* close the line when no word token follows */
					pt2 = next_node(pt->next);
					if ((!pt2) || (!strcmp(find_attribute(pt2->properties, "type"), "sgmltag"))) {
						if (cate)
							printf("--%s--%d", cate, *nben);
						if (chendline[0])
							printf("%s", chendline);
						printf("\n");
					}
					/* does an entity end on this token? */
					newcate = find_cate(find_attribute(pt->properties, "id"), T_end_ne);
					if (newcate)
						cate = NULL;

					prevsgml = False;
				}
			}
		}
		process_token(cur_node->children, nben);
	}
}

/* ................................................................ */

/*
 * Program entry point.
 *
 * Options:
 *   -tk <file>  XML token file, parsed with xmlReadFile()   (required)
 *   -ne <file>  named-entity file, read by load_ne()        (required)
 *   -h          print the syntax on stderr and exit with status 0
 *
 * The text of the "Header_CTM" element, when the document has one, is
 * printed first (without its leading newline, if any); the tokens found
 * under the "Token" element are then printed by process_token().
 *
 * Returns 0 on success. A bad command line or an unreadable input is fatal
 * and reported through ERREUR().
 */
int 
main(int argc, char **argv)
{
	char            ch[TailleLigne], *chfilene;
	xmlDoc         *doc = NULL;
	xmlNode        *root_element, *ptnode;
	int             nb;

	/*
	 * This initializes the library and checks for potential ABI
	 * mismatches between the version it was compiled for and the actual
	 * shared library used.
	 */
	LIBXML_TEST_VERSION

	chfilene = NULL;
	for (nb = 1; nb < argc; nb++)
		if (!strcmp(argv[nb], "-tk")) {
			if (nb + 1 == argc)
				ERREUR("an option must follow option:", argv[nb]);
			if (!(doc = xmlReadFile(argv[++nb], NULL, 0)))
				ERREUR("could not parse file:", argv[nb]);
		} else if (!strcmp(argv[nb], "-ne")) {
			if (nb + 1 == argc)
				ERREUR("an option must follow option:", argv[nb]);
			chfilene = argv[++nb];
		} else if (!strcmp(argv[nb], "-h")) {
			fprintf(stderr, "Syntax: %s [-h] -tk <file xml> -ne <output>\n", argv[0]);
			exit(0);
		} else
			ERREUR("unknown option:", argv[nb]);

	if ((!doc) || (!chfilene))
		ERREUR("bad syntax, check '-h'", "");

	load_ne(chfilene);

	/* Get the root element node */
	root_element = xmlDocGetRootElement(doc);

	/* The header is optional: a document without it has nothing to print
	 * here, and must not be dereferenced. */
	ptnode = find_node(root_element, "Header_CTM");
	if (ptnode) {
		ch[0] = '\0';
		sprint_word_raw(ch, ptnode->children);
		if (ch[0] == '\n')
			printf("%s", ch + 1);
		else
			printf("%s", ch);
	}

	ptnode = find_node(root_element, "Token");
	nb = 0;
	process_token(ptnode, &nb);

	/* free the document */
	xmlFreeDoc(doc);
	/*
	 * Free the global variables that may have been allocated by the
	 * parser.
	 */
	xmlCleanupParser();
	return 0;
}
#else
/*
 * Fallback entry point used when libxml2 was built without tree support:
 * says so on stderr and ends the program with status 1.
 */
int 
main(void)
{
	fputs("Tree support not compiled in\n", stderr);
	return 1;
}
#endif