/* From token°ne files to a STM one */ /* FRED 0309 */ #include #include #include #include #include #include #ifdef LIBXML_TREE_ENABLED /* ................................................................ */ #define TailleLigne 80000 #define True 1 #define False 0 void ERREUR(char *ch1, char *ch2) { fprintf(stderr, "ERREUR : %s %s\n", ch1, ch2); exit(0); } /* ................................................................ */ /* format STM ;; Transcriber export by stm.tcl,v 1.21 on ven mar 06 11:37:51 CET 2009 with encoding ISO-8859-1 ;; transcribed by (null), version 35 of 090306 ;; ;; CATEGORY "0" "" "" ;; LABEL "O" "Overall" "Overall" ;; ;; CATEGORY "1" "Hub4 Focus Conditions" "" ;; LABEL "F0" "Baseline//Broadcast//Speech" "" ;; LABEL "F1" "Spontaneous//Broadcast//Speech" "" ;; LABEL "F2" "Speech Over//Telephone//Channels" "" ;; LABEL "F3" "Speech in the//Presence of//Background Music" "" ;; LABEL "F4" "Speech Under//Degraded//Acoustic Conditions" "" ;; LABEL "F5" "Speech from//Non-Native//Speakers" "" ;; LABEL "FX" "All other speech" "" ;; CATEGORY "2" "Speaker Sex" "" ;; LABEL "female" "Female" "" ;; LABEL "male" "Male" "" ;; LABEL "unknown" "Unknown" "" 20070711_1900_1920_inter 1 excluded_region 0.000 81.085 ignore_time_segment_in_scoring 20070711_1900_1920_inter 1 20070711_1900_1920_inter_speaker_20 81.085 84.582 le 18 20 continue avec le journal de Mickaël Thébault . bonsoir Mick aël . 20070711_1900_1920_inter 1 Mickaël_Thébault 84.582 85.476 bonsoir . 20070711_1900_1920_inter 1 inter_segment_gap 85.476 85.982 20070711_1900_1920_inter 1 Mickaël_Thébault 85.982 91.647 quand l'ouverture sarkozyenne fait des ravages : au PS encore un éléphant quitte le tro upeau . 20070711_1900_1920_inter 1 Mickaël_Thébault 91.647 96.691 Jack Lang démissionne des instances dirigeantes du parti , réactions à gauche et à droi te dans un instant . 
20070711_1900_1920_inter 1 Mickaël_Thébault 96.691 104.302 [r] la peine maximale pour Pierrot le fou , Pierre Bodein condamné à la réclusion criminelle à perpétuité dont 30 ans incompressibles .

format token
;; Transcriber export by stm.tcl,v 1.21 on ven mar 06 11:37:51 CET 2009 with encoding ISO-8859-1
;; transcribed by (null), version 35 of 090306
;;
;; CATEGORY "0" "" ""
;; LABEL "O" "Overall" "Overall"
;;
;; CATEGORY "1" "Hub4 Focus Conditions" ""
;; LABEL "F0" "Baseline//Broadcast//Speech" ""
;; LABEL "F1" "Spontaneous//Broadcast//Speech" ""
;; LABEL "F2" "Speech Over//Telephone//Channels" ""
;; LABEL "F3" "Speech in the//Presence of//Background Music" ""
;; LABEL "F4" "Speech Under//Degraded//Acoustic Conditions" ""
;; LABEL "F5" "Speech from//Non-Native//Speakers" ""
;; LABEL "FX" "All other speech" ""
;; CATEGORY "2" "Speaker Sex" ""
;; LABEL "female" "Female" ""
;; LABEL "male" "Male" ""
;; LABEL "unknown" "Unknown" ""
le dix huit vingt continue avec le journal de Mickaël Thébault . bonsoir Mickaël .
le dix huit vingt continue avec le journal de Mickaël Thébault . bonsoir Mickaël .

format ne
Mickaël Thébault . bonsoir Mickaël PS Jack Lang Pierrot Pierre Bodein trente ans cour d' assises du Bas_Rhin Air_France Pakistan

format stm_ne
20030418_0800_0900_FRANCEINTER_DGA 1 Patrick_Roger 12.793 16.698 les ministres [pers.hum François Fillon ] et [pers.hum Jean-Paul Delevoye ] dévo
*/
/* ................................................................
*/

/* Non-zero when `a` lies in the range A..Z. Not referenced by the code
 * visible in this file. */
#define IF_MAJUSCULE(a) (((a)>='A')&&((a)<='Z'))

/*
 * One entry of an entity table.
 *   token: identifier that find_cate() compares with the id attribute of a
 *          token element.
 *   cate:  category text that process_token() prints after an opening
 *          bracket.
 */
typedef struct {
    char *token;
    char *cate;
} type_ne;

/* Capacity of each of the two tables below. */
#define MAX_NE 10000

/* Looked up by process_token() before a word is printed (bracket opening). */
type_ne T_begin_ne[MAX_NE];
/* Looked up by process_token() after a word is printed (bracket closing). */
type_ne T_end_ne[MAX_NE];

/*
 * load_ne - read the file named `chfilene` line by line and terminate both
 * entity tables with an entry whose token is NULL.
 *
 * Exits through ERREUR() when the file cannot be opened.
 *
 * NOTE(review): the body of the read loop is damaged in this copy of the
 * file. Everything between the opening of the string passed to strncmp()
 * and the MAX_NE capacity test is missing, so the code that presumably
 * parses a line and stores entries in T_begin_ne / T_end_ne cannot be seen
 * here. The damaged statement is kept exactly as found; restore it from an
 * intact copy. The locals chcate, chbegin, chend, i and j are presumably
 * used by the missing part.
 */
void
load_ne(char *chfilene)
{
    FILE *file;
    char ch[TailleLigne], chcate[100], *chbegin, *chend;
    int nbbegin, nbend, i, j;

    if (!(file = fopen(chfilene, "rt")))
        ERREUR("can't open:", chfilene);
    for (nbbegin = nbend = 0; fgets(ch, TailleLigne, file);) {
        if (!strncmp(ch, "= MAX_NE) || (nbend >= MAX_NE)) ERREUR("cste MAX_NE too small", "");
        }
    }
    /* A NULL token marks the end of each table; find_cate() stops on it. */
    T_begin_ne[nbbegin].token = T_end_ne[nbend].token = NULL;
    fclose(file);
}

/*
 * find_cate - look up `id` in the table `tabl`.
 *
 * Entries are scanned in order until one whose token equals `id` is found
 * or the entry with a NULL token is reached.
 *
 * Returns the cate field of the matching entry, or NULL when there is no
 * match.
 */
char *
find_cate(char *id, type_ne * tabl)
{
    int i;

    for (i = 0; (tabl[i].token) && (strcmp(tabl[i].token, id)); i++)
        /* fprintf(stderr,"XX:[%s * ]\n",tabl[i].token) */ ;
    if (tabl[i].token)
        return tabl[i].cate;
    else
        return NULL;
}

/* ................................................................ */

/*
 * sprint_word - append to `ch` the text held by `node`, by its following
 * siblings and by their descendants.
 *
 * For a node that has content:
 *   - one space is appended first when `ch` is not empty and does not
 *     already end with a space;
 *   - newline characters of the content are dropped;
 *   - a space that directly follows another space in the content is
 *     dropped.
 * The function then recurses on node->next and afterwards on
 * node->children. A NULL node is ignored.
 *
 * NOTE(review): nothing checks the capacity of `ch`; the caller must
 * provide a buffer large enough for all the appended text.
 */
void
sprint_word(char *ch, xmlNode * node)
{
    if (node) {
        if (node->content) {
            int i, j;
            char *chin;

            if ((ch[0]) && (ch[strlen(ch) - 1] != ' '))
                strcat(ch, " ");
            for (i = 0, j = strlen(ch), chin = (char *) node->content; chin[i]; i++) {
                /* The else below belongs to the inner if: a repeated space
                 * hits the empty statement, any other character that is not
                 * a newline is copied. */
                if (chin[i] != '\n')
                    if ((i > 0) && (chin[i] == ' ') && (chin[i - 1] == ' '));
                    else
                        ch[j++] = chin[i];
            }
            ch[j] = '\0';
        }
        sprint_word(ch, node->next);
        sprint_word(ch, node->children);
    }
}

/*
 * sprint_word_raw - append the content of `node` to `ch` unchanged, then
 * append the text of its following siblings and of its children.
 *
 * NOTE(review): only the content of `node` itself is copied unchanged; the
 * siblings and children are handled by sprint_word(), which drops newlines
 * and repeated spaces. Confirm whether a recursion on sprint_word_raw() was
 * intended.
 * NOTE(review): strcat() is not bounded by the capacity of `ch`.
 */
void
sprint_word_raw(char *ch, xmlNode * node)
{
    if (node) {
        if (node->content)
            strcat(ch, (char *) node->content);
        sprint_word(ch, node->next);
        sprint_word(ch, node->children);
    }
}

/*
 * find_attribute - return the value of the attribute called `name`.
 *
 * ptat: first attribute of the list to search.
 *
 * Returns the content of the first child of the matching attribute. Exits
 * through ERREUR() when the attribute is absent or has no child content.
 */
char *
find_attribute(xmlAttr * ptat, char *name)
{
    /* Advance to the attribute with the requested name; empty loop body. */
    for (; (ptat) && (strcmp((char *) (ptat->name), name)); ptat = ptat->next);
    if ((!ptat) || (ptat->children == NULL) || (ptat->children->content == NULL))
        ERREUR("corpus without ", name);
    return (char *) ptat->children->content;
}

/*
 * find_node - search for an element called `name`.
 *
 * Each node of the sibling list starting at `a_node` is examined in order:
 * a matching element is returned at once, otherwise the children of that
 * node are searched recursively before moving to the next sibling.
 *
 * Returns the first element found, or NULL when there is none.
 */
xmlNode *
find_node(xmlNode * a_node, char *name)
{
    xmlNode *cur_node = NULL, *resu;

    for (cur_node = a_node; cur_node; cur_node = cur_node->next)
        if ((cur_node->type == XML_ELEMENT_NODE) && (!strcmp((char *)(cur_node->name), name)))
            return cur_node;
        else {
            resu = find_node(cur_node->children, name);
            if (resu)
                return resu;
        }
    return NULL;
}

/*
 * process_token - print on stdout the words of every sentence element
 * reachable from `a_node`, with entity brackets inserted.
 *
 * The sibling list starting at `a_node` is walked and the function recurses
 * on the children of every node. For each sentence element, the token
 * elements found under its tokens element are handled as follows:
 *   - a token whose type attribute is sgmltag has its content attribute
 *     printed on a new line, with square brackets turned into angle
 *     brackets; if an entity is open and the previous token was not a tag,
 *     the bracket is closed first and `interuptus` records that it has to
 *     be reopened;
 *   - any other token has its text collected with sprint_word() and
 *     printed; a bracket followed by the category is printed before the
 *     word when the token id is in T_begin_ne (or when an interrupted
 *     entity must be reopened), and a closing bracket is printed after the
 *     word when the token id is in T_end_ne.
 * Exits through ERREUR() when a sentence has no tokens element.
 *
 * NOTE(review): cate, interuptus and prevsgml are static and are reset by
 * the initialisation clause of the outer loop, which runs again on every
 * recursive call, so the state of the caller is cleared by the recursion.
 * NOTE(review): strcpy() into `ch` is not bounded by TailleLigne.
 * NOTE(review): pt2 and ptat are declared but never used.
 */
void
process_token(xmlNode * a_node)
{
    xmlNode *cur_node = NULL, *pt, *pt2;
    xmlAttr *ptat;
    static char ch[TailleLigne], *cate, *newcate;
    static int interuptus, prevsgml;
    int i;

    for (interuptus = prevsgml = False, cate = NULL, cur_node = a_node; cur_node; cur_node = cur_node->next) {
        if (cur_node->type == XML_ELEMENT_NODE) {
            if (!strcmp((char *)(cur_node->name), "sentence")) {
                pt = find_node(cur_node->children, "tokens");
                if (!pt)
                    ERREUR("bad format in xml: no 'tokens'", "");
                for (pt = pt->children; pt; pt = pt->next)
                    if ((pt->type == XML_ELEMENT_NODE) && (!strcmp((char *)(pt->name), "token"))) {
                        if (!strcmp(find_attribute(pt->properties, "type"), "sgmltag")) {
                            /* Tag token: suspend an open entity so that the
                             * tag is printed outside the brackets. */
                            if (cate) {
                                if (!prevsgml) {
                                    printf(" ] ");
                                    interuptus = True;
                                }
                            } else
                                interuptus = False;
                            strcpy(ch, find_attribute(pt->properties, "content"));
                            for (i = 0; ch[i]; i++)
                                if (ch[i] == '[')
                                    ch[i] = '<';
                                else if (ch[i] == ']')
                                    ch[i] = '>';
                            printf("\n%s", ch);
                            prevsgml = True;
                        } else {
                            /* Word token: collect its text, then print it
                             * with the brackets that start or end here. */
                            ch[0] = '\0';
                            sprint_word(ch, pt->children);
                            newcate = find_cate(find_attribute(pt->properties, "id"), T_begin_ne);
                            if (newcate) {
                                printf(" [%s", newcate);
                                cate = newcate;
                            } else {
                                /* Reopen an entity that a tag interrupted. */
                                if ((cate) && (interuptus)) {
                                    printf(" [%s", cate);
                                    interuptus = False;
                                }
                            }
                            printf(" %s", ch);
                            newcate = find_cate(find_attribute(pt->properties, "id"), T_end_ne);
                            if (newcate) {
                                printf(" ] ");
                                cate = NULL;
                                interuptus = False;
                            }
                            prevsgml = False;
                        }
                    }
            }
        }
        process_token(cur_node->children);
    }
}

/* ................................................................ */

/*
 * main - command-line entry point.
 *
 * Options:
 *   -tk FILE   XML file parsed with xmlReadFile()
 *   -ne FILE   file handed to load_ne()
 *   -h         print the usage line on stderr and exit with status 0
 * Both -tk and -ne are required; any other argument is a fatal error.
 *
 * The text under the Header_STM element is printed first, then the
 * sentences reachable from the Token element are printed by
 * process_token().
 *
 * NOTE(review): the usage line mentions -doc, but the option accepted by
 * the parsing loop is -tk.
 * NOTE(review): when no Header_STM element exists, find_node() returns NULL
 * and ptnode->children dereferences it.
 * NOTE(review): sprint_word_raw() appends to the local buffer `ch` without
 * any bound check.
 */
int
main(int argc, char **argv)
{
    char ch[TailleLigne], *chfilene;
    xmlDoc *doc = NULL;
    xmlNode *root_element, *ptnode;
    int nb;

    /*
     * Initialise the library and check for ABI mismatches between the
     * version this program was compiled for and the shared library
     * actually in use.
     */
    LIBXML_TEST_VERSION

    chfilene = NULL;
    if (argc > 1)
        for (nb = 1; nb < argc; nb++)
            if (!strcmp(argv[nb], "-tk")) {
                if (nb + 1 == argc)
                    ERREUR("an option must follow option:", argv[nb]);
                if (!(doc = xmlReadFile(argv[++nb], NULL, 0)))
                    ERREUR("could not parse file:", argv[nb]);
            } else if (!strcmp(argv[nb], "-ne")) {
                if (nb + 1 == argc)
                    ERREUR("an option must follow option:", argv[nb]);
                chfilene = argv[++nb];
            } else if (!strcmp(argv[nb], "-h")) {
                fprintf(stderr, "Syntax: %s [-h] -doc -ne \n", argv[0]);
                exit(0);
            } else
                ERREUR("unknown option:", argv[nb]);
    if ((!doc) || (!chfilene))
        ERREUR("bad syntax, check '-h'", "");

    load_ne(chfilene);

    /* Get the root element node */
    root_element = xmlDocGetRootElement(doc);

    /* Copy the header text to stdout. */
    ptnode = find_node(root_element, "Header_STM");
    ch[0] = '\0';
    sprint_word_raw(ch, ptnode->children);
    printf("%s", ch);

    /* Print the sentences; process_token() accepts a NULL node. */
    ptnode = find_node(root_element, "Token");
    process_token(ptnode);

    /* Free the document */
    xmlFreeDoc(doc);

    /*
     * Free the global variables that may have been allocated by the
     * parser.
     */
    xmlCleanupParser();
    return 0;
}

#else

/* Fallback entry point used when LIBXML_TREE_ENABLED is not defined:
 * reports the problem on stderr and exits with status 1. */
int
main(void)
{
    fprintf(stderr, "Tree support not compiled in\n");
    exit(1);
}

#endif