/*
 * From token and NE files to an STM one.
 * FRED 0309 - MODIF GERALDINE 0710
 *
 * Reads a token XML document (option -tk) and a named-entity file
 * (option -ne), prints the text found under the Header_CTM element, then
 * prints the tokens of every sentence element; see process_token for the
 * exact output layout.
 *
 * NOTE(review): this copy of the file looks damaged by markup stripping:
 * the #include directives have lost their header names, the body of the
 * read loop in load_ne is truncated, and the format examples further down
 * have lost their tags. Restore those parts from the upstream file before
 * trying to build.
 */
/*
 * NOTE(review): the six header names are missing in this copy. The code
 * below calls stdio, stdlib, string and libxml2 parser/tree functions, so
 * presumably <stdio.h>, <stdlib.h>, <string.h>, <libxml/parser.h> and
 * <libxml/tree.h> were among them - TODO confirm and restore.
 */
#include
#include
#include
#include
#include
#include
#ifdef LIBXML_TREE_ENABLED
/* ................................................................ */
/* Size in bytes of the fixed line and text buffers used in this file. */
#define TailleLigne 80000
/* Integer booleans used for the prevsgml flag in process_token. */
#define True 1
#define False 0
/*
 * Report a fatal error and terminate the program.
 *
 * Writes the fixed prefix followed by ch1 and ch2 to stderr, then calls
 * exit, so this function never returns to its caller.
 *
 * NOTE(review): the exit status is 0, so a calling script cannot tell a
 * failure from a success - confirm whether this is intended.
 */
void ERREUR(char *ch1, char *ch2)
{
    fprintf(stderr, "ERREUR : %s %s\n", ch1, ch2);
    exit(0);
}
/* ................................................................ */
/*
 * Format examples kept from the original source.
 * NOTE(review): the token and ne examples appear to have lost their
 * markup in this copy; only the plain text remains.
 *
 * format CTM-NE
 *
 * 20041006_0700_0800_CLASSIQUE 1 1.98 0.19 bonne
 * 20041006_0700_0800_CLASSIQUE 1 2.17 0.54 journée
 * 20041006_0700_0800_CLASSIQUE 1 2.71 0.19 bon
 * 20041006_0700_0800_CLASSIQUE 1 2.90 0.82 courage
 * 20041006_0700_0800_CLASSIQUE 1 3.72 0.27 pour
 * 20041006_0700_0800_CLASSIQUE 1 3.99 0.20 ce
 * 20041006_0700_0800_CLASSIQUE 1 4.19 0.46 mercredi--time--1
 * 20041006_0700_0800_CLASSIQUE 1 4.65 0.25 six--time--1
 * 20041006_0700_0800_CLASSIQUE 1 4.90 0.44 octobre--time--1
 * 20041006_0700_0800_CLASSIQUE 1 5.34 0.31 dans
 * 20041006_0700_0800_CLASSIQUE 1 5.65 0.15 une
 * 20041006_0700_0800_CLASSIQUE 1 5.80 0.33 petite
 * 20041006_0700_0800_CLASSIQUE 1 6.13 0.48 minute
 * 20041006_0700_0800_CLASSIQUE 1 6.61 0.17 le
 * 20041006_0700_0800_CLASSIQUE 1 6.78 0.34 journal
 * 20041006_0700_0800_CLASSIQUE 1 7.12 0.10 de
 * 20041006_0700_0800_CLASSIQUE 1 7.22 0.05 l'
 * 20041006_0700_0800_CLASSIQUE 1 7.27 0.42 économie
 * 20041006_0700_0800_CLASSIQUE 1 7.69 0.42 deuxième
 * 20041006_0700_0800_CLASSIQUE 1 8.11 0.43 édition
 * 20041006_0700_0800_CLASSIQUE 1 8.54 0.10 mais
 * 20041006_0700_0800_CLASSIQUE 1 8.64 0.47 tout_de_suite
 * 20041006_0700_0800_CLASSIQUE 1 9.11 0.17 les
 * 20041006_0700_0800_CLASSIQUE 1 9.28 0.19 grands
 * 20041006_0700_0800_CLASSIQUE 1 9.47 0.31 titres
 * 20041006_0700_0800_CLASSIQUE 1 9.78 0.11 de
 * 20041006_0700_0800_CLASSIQUE 1 9.89 0.05 l'
 * 20041006_0700_0800_CLASSIQUE 1 9.94 0.62 actualité
 * 20041006_0700_0800_CLASSIQUE 1 10.56 0.27 mode
 * 20041006_0700_0800_CLASSIQUE 1 10.83 0.29 bayeux--loc--2
 * 20041006_0700_0800_CLASSIQUE 1 11.12 0.55 bonjour
 * 20041006_0700_0800_CLASSIQUE 1 11.75 0.37 bonjour
 *
 * format token
 *
 * le dix huit vingt continue avec le journal de Mickaël Thébault .
 * bonsoir Mickaël .
 * le dix huit vingt continue avec le journal de Mickaël Thébault .
 * bonsoir Mickaël .
 *
 * format ne
 *
 * Mickaël Thébault . bonsoir Mickaël PS Jack Lang Pierrot Pierre Bodein
 * trente ans cour d' assises du Bas_Rhin Air_France Pakistan
 *
 * format stm_ne
 *
 * 20030418_0800_0900_FRANCEINTER_DGA 1 Patrick_Roger 12.793 16.698 les
 * ministres [pers.hum François Fillon ] et [pers.hum Jean-Paul Delevoye ]
 * dévo
 */
/* ................................................................ */
/*
 * True when a lies in the ASCII range of upper-case letters.
 * Not referenced anywhere in the visible part of this file.
 */
#define IF_MAJUSCULE(a) (((a)>='A')&&((a)<='Z'))
/*
 * One entry of a named-entity lookup table.
 * find_cate compares token with a token id and hands back cate.
 */
typedef struct
{
    /* key compared with the id attribute of a token element */
    char *token;
    /* category string returned by find_cate for that key */
    char *cate;
} type_ne;
/* Capacity of each lookup table below. */
#define MAX_NE 10000
/*
 * Lookup tables consulted by process_token: a hit in T_begin_ne starts a
 * category, a hit in T_end_ne clears it. Each table ends at the first
 * entry whose token is NULL (written by load_ne, relied on by find_cate).
 */
type_ne T_begin_ne[MAX_NE];
type_ne T_end_ne[MAX_NE];
/*
 * Load the named-entity file chfilene into T_begin_ne and T_end_ne.
 *
 * Opens the file (ERREUR on failure), reads it line by line into a
 * TailleLigne buffer, writes a NULL token after the last used entry of
 * both tables and closes the file.
 *
 * NOTE(review): the per-line parsing code is missing in this copy, so
 * how token and cate get filled cannot be told from here. The locals
 * chcate, chbegin, chend, i and j are unused in what remains and
 * presumably belonged to the lost code.
 */
void load_ne(char *chfilene)
{
    FILE *file;
    char ch[TailleLigne], chcate[100], *chbegin, *chend;
    int nbbegin, nbend, i, j;
    if (!(file = fopen(chfilene, "rt")))
        ERREUR("can't open:", chfilene);
    for (nbbegin = nbend = 0; fgets(ch, TailleLigne, file);) {
        /*
         * NOTE(review): DAMAGED STATEMENT, kept verbatim. Everything
         * between the opening of the strncmp pattern and the MAX_NE
         * capacity check has been lost; the statement below does not
         * compile as it stands. The brace that follows it closed a block
         * that is no longer visible.
         */
        if (!strncmp(ch, "= MAX_NE) || (nbend >= MAX_NE)) ERREUR("cste MAX_NE too small", "");
        }
    }
    T_begin_ne[nbbegin].token = T_end_ne[nbend].token = NULL;
    fclose(file);
}
/*
 * Look id up in the NULL-token-terminated table tabl.
 *
 * Returns the cate of the first entry whose token equals id, or NULL when
 * the end of the table is reached without a match.
 */
char * find_cate(char *id, type_ne * tabl)
{
    int i;
    /* empty-bodied scan: stops on the terminator or on the first match */
    for (i = 0; (tabl[i].token) && (strcmp(tabl[i].token, id)); i++)
        /* disabled debug trace: fprintf(stderr,"XX:[%s]\n",tabl[i].token) */
        ;
    if (tabl[i].token)
        return tabl[i].cate;
    else
        return NULL;
}
/* ................................................................ */
/*
 * Append to ch the text content of node, of its following siblings and
 * of its children.
 *
 * For a node with content: a single space is appended first when ch is
 * non-empty and does not already end with a space; then the content is
 * copied with newline characters dropped and with any space skipped when
 * the preceding source character is also a space. The recursion visits
 * node->next before node->children.
 *
 * ch must be a NUL-terminated buffer; nothing here checks its capacity.
 */
void sprint_word(char *ch, xmlNode * node)
{
    if (node) {
        if (node->content) {
            int i, j;
            char *chin;
            if ((ch[0]) && (ch[strlen(ch) - 1] != ' '))
                strcat(ch, " ");
            for (i = 0, j = strlen(ch), chin = (char *) node->content; chin[i]; i++) {
                /* the else pairs with the inner if, whose body is empty */
                if (chin[i] != '\n')
                    if ((i > 0) && (chin[i] == ' ') && (chin[i - 1] == ' '));
                    else
                        ch[j++] = chin[i];
            }
            /* j overwrote the old terminator, so put a new one back */
            ch[j] = '\0';
        }
        sprint_word(ch, node->next);
        sprint_word(ch, node->children);
    }
}
/*
 * Append the content of node to ch unchanged, unless that content is
 * exactly one newline, then continue with the siblings and children.
 *
 * NOTE(review): the two recursive calls go to sprint_word, not to
 * sprint_word_raw, so only the first node is copied raw and the rest gets
 * the filtering done by sprint_word - confirm this is intended.
 * Nothing here checks the capacity of ch.
 */
void sprint_word_raw(char *ch, xmlNode * node)
{
    if (node) {
        if ((node->content) && (strcmp((char *) node->content, "\n")))
            strcat(ch, (char *) node->content);
        sprint_word(ch, node->next);
        sprint_word(ch, node->children);
    }
}
/*
 * Return the value of the attribute called name in the list ptat.
 *
 * The value is the content of the first child node of the attribute.
 * When the attribute is absent or carries no content, ERREUR is called,
 * which terminates the program.
 */
char * find_attribute(xmlAttr * ptat, char *name)
{
    /* empty-bodied scan: leaves ptat on the match or on NULL */
    for (; (ptat) && (strcmp((char *) (ptat->name), name)); ptat = ptat->next);
    if ((!ptat) || (ptat->children == NULL) || (ptat->children->content == NULL))
        ERREUR("corpus without ", name);
    return (char *) ptat->children->content;
}
/*
 * Non-fatal variant of find_attribute: returns 1 when the attribute called
 * name exists in ptat and carries content, 0 otherwise.
 */
int if_attribute(xmlAttr * ptat, char *name)
{
    for (; (ptat) && (strcmp((char *) (ptat->name), name)); ptat = ptat->next);
    if ((!ptat) || (ptat->children == NULL) || (ptat->children->content == NULL))
        return 0;
    return 1;
}
/*
 * Search a_node and its following siblings, descending into children, for
 * the first element node called name.
 *
 * Each sibling is tested first; when it does not match, its children are
 * searched before moving on to the next sibling. Returns the node found
 * or NULL.
 *
 * NOTE(review): unlike find_attribute, the node name is handed to strcmp
 * without a (char *) cast.
 */
xmlNode * find_node(xmlNode * a_node, char *name)
{
    xmlNode *cur_node = NULL, *resu;
    for (cur_node = a_node; cur_node; cur_node = cur_node->next)
        if ((cur_node->type == XML_ELEMENT_NODE) && (!strcmp(cur_node->name, name)))
            return cur_node;
        else {
            resu = find_node(cur_node->children, name);
            if (resu)
                return resu;
        }
    return NULL;
}
/*
 * Starting at pt (inclusive), follow the sibling chain and return the
 * first element node called token, or NULL when there is none.
 */
xmlNode * next_node(xmlNode * pt)
{
    while ((pt) && ((pt->type != XML_ELEMENT_NODE) || (strcmp(pt->name, "token"))))
        pt = pt->next;
    return pt;
}
/*
 * Walk a_node, its siblings and all their descendants, and print the
 * tokens of every sentence element to stdout.
 *
 * For each sentence element the tokens element is located (ERREUR when
 * missing) and its token children are handled in order:
 *   - type sgmltag: the content attribute is printed as is, and the
 *     endline attribute, when present, is remembered;
 *   - any other type: the text of the token is printed, preceded by a
 *     space when the previous token was an sgmltag and by an underscore
 *     otherwise. When no token element follows, or the next one is an
 *     sgmltag, the line is finished: the active category and the counter
 *     are appended as --category--counter, then the remembered endline
 *     text, then a newline.
 * A token whose id is found in T_begin_ne increments *nben and makes that
 * category active; a token whose id is found in T_end_ne clears the
 * active category after it has been printed.
 *
 * a_node: first node to examine; NULL is accepted and does nothing.
 * nben:   counter of entities started so far, updated in place.
 *
 * The working buffers, the active category and the prevsgml flag are
 * static, so their values carry over between sentences and between the
 * recursive calls. The strcpy calls and sprint_word do not check the
 * capacity of these buffers.
 */
void process_token(xmlNode * a_node, int *nben)
{
    xmlNode *cur_node = NULL, *pt, *pt2;
    /* ptat, and i further down, are declared but never used here */
    xmlAttr *ptat;
    static char ch[TailleLigne], *cate = NULL, *newcate , chendline[TailleLigne];
    static int prevsgml = False;
    int i;
    for (cur_node = a_node; cur_node; cur_node = cur_node->next) {
        if (cur_node->type == XML_ELEMENT_NODE) {
            if (!strcmp(cur_node->name, "sentence")) {
                pt = find_node(cur_node->children, "tokens");
                if (!pt)
                    ERREUR("bad format in xml: no 'tokens'", "");
                for (pt = pt->children; pt; pt = pt->next)
                    if ((pt->type == XML_ELEMENT_NODE) && (!strcmp(pt->name, "token"))) {
                        if (!strcmp(find_attribute(pt->properties, "type"), "sgmltag")) {
                            /* a new sgmltag discards the previous endline text */
                            chendline[0] = '\0';
                            strcpy(ch, find_attribute(pt->properties, "content"));
                            printf("%s", ch);
                            if (if_attribute(pt->properties, "endline"))
                                strcpy(chendline, find_attribute(pt->properties, "endline"));
                            prevsgml = True;
                        } else {
                            ch[0] = '\0';
                            sprint_word(ch, pt->children);
                            /* does this token open a named entity? */
                            newcate = find_cate(find_attribute(pt->properties, "id"), T_begin_ne);
                            if (newcate) {
                                (*nben)++;
                                cate = newcate;
                            }
                            if (prevsgml)
                                printf(" ");
                            else
                                printf("_");
                            printf("%s", ch);
                            pt2 = next_node(pt->next);
                            /*
                             * disabled debug trace:
                             * if (pt2) {
                             *     printf("POPO: type=%s\n",find_attribute(pt2->properties,"type"));
                             * }
                             */
                            /* last word before the next sgmltag or the end: close the line */
                            if ((!pt2) || (!strcmp(find_attribute(pt2->properties, "type"), "sgmltag"))) {
                                if (cate)
                                    printf("--%s--%d", cate, *nben);
                                if (chendline[0])
                                    printf("%s", chendline);
                                printf("\n");
                            }
                            /* does this token close the named entity? */
                            newcate = find_cate(find_attribute(pt->properties, "id"), T_end_ne);
                            if (newcate)
                                cate = NULL;
                            prevsgml = False;
                        }
                    }
            }
        }
        /* descend into every node, whatever its type or name */
        process_token(cur_node->children, nben);
    }
}
/* ................................................................ */
/*
 * Command-line entry point.
 *
 * Options: -tk followed by the token XML file (parsed at once with
 * xmlReadFile), -ne followed by the named-entity file, -h to print the
 * syntax line and exit. Both -tk and -ne are required. Any problem is
 * reported through ERREUR, which terminates the program.
 *
 * After loading the NE tables, the text under the Header_CTM element is
 * printed (minus one leading newline, if any), then process_token is run
 * from the Token element.
 *
 * NOTE(review): the syntax string printed by -h presumably lost the
 * placeholders for the two file arguments in this copy.
 */
int main(int argc, char **argv)
{
    char ch[TailleLigne], *chfilene;
    xmlDoc *doc = NULL;
    xmlNode *root_element, *ptnode;
    int nb;
    /*
     * This initializes the library and checks for potential ABI
     * mismatches between the version it was compiled for and the actual
     * shared library used.
     */
    LIBXML_TEST_VERSION
    chfilene = NULL;
    if (argc > 1)
        for (nb = 1; nb < argc; nb++)
            if (!strcmp(argv[nb], "-tk")) {
                if (nb + 1 == argc)
                    ERREUR("an option must follow option:", argv[nb]);
                /* nb now indexes the file name, which the error message reuses */
                if (!(doc = xmlReadFile(argv[++nb], NULL, 0)))
                    ERREUR("could not parse file:", argv[nb]);
            } else if (!strcmp(argv[nb], "-ne")) {
                if (nb + 1 == argc)
                    ERREUR("an option must follow option:", argv[nb]);
                chfilene = argv[++nb];
            } else if (!strcmp(argv[nb], "-h")) {
                fprintf(stderr, "Syntax: %s [-h] -tk -ne \n", argv[0]);
                exit(0);
            } else
                ERREUR("unknown option:", argv[nb]);
    if ((!doc) || (!chfilene))
        ERREUR("bad syntax, check '-h'", "");
    load_ne(chfilene);
    /* Get the root element node */
    root_element = xmlDocGetRootElement(doc);
    ptnode = find_node(root_element, "Header_CTM");
    ch[0] = '\0';
    /*
     * NOTE(review): find_node returns NULL when the document has no
     * Header_CTM element, and ptnode is dereferenced here without a check.
     */
    sprint_word_raw(ch, ptnode->children);
    /* skip one leading newline of the header text */
    if (ch[0] == '\n')
        printf("%s", ch + 1);
    else
        printf("%s", ch);
    ptnode = find_node(root_element, "Token");
    nb = 0;
    process_token(ptnode, &nb);
    /* free the document */
    xmlFreeDoc(doc);
    /*
     * Free the global variables that may have been allocated by the
     * parser.
     */
    xmlCleanupParser();
    return 0;
}
#else
/*
 * Fallback entry point used when LIBXML_TREE_ENABLED is not defined:
 * prints a message to stderr and exits with status 1.
 */
int main(void)
{
    fprintf(stderr, "Tree support not compiled in\n");
    exit(1);
}
#endif