/* From a TOKEN file with POS file to a CRF++ training file */ /* FRED 0309 */ #include #include #include #include /* ................................................................ */ #define TailleLigne 8000 #define True 1 #define False 0 void ERREUR(char *ch1, char *ch2) { fprintf(stderr, "ERREUR : %s %s\n", ch1, ch2); exit(0); } #define SI_MINUSCULE(a) (((a)>='a')&&((a)<='z')) /* ................................................................ */ /* * Format: * * file .tk * * merci � tous d' �couter France_Inter euh vous * �tes en ce moment un million deux cent mille � �tre branch�s sur * l' antenne de France_Inter , et_puis ce qui nous fait plaisir aussi , c' * est que le petit matin marche tr�s_bien . * merci * � tous * * file .pos * * merci � tous d' �couter France_Inter * euh vous �tes en ce * * * CRF++ training: * * Patricia XPREF B-pers Martin XFAMIL I-pers que COSUB O voici PREP O que COSUB * O voilà PREP O * */ void analyze_pos(char *ch, char **chid, char **chpos, char **chword) { int i; for (i = 0; (ch[i]) && (strncmp(ch + i, "token=", 6)); i++); if (!ch[i]) ERREUR("bad POS string:", ch); (*chid) = ch + i + 7; for (i += 7; (ch[i]) && (ch[i] != '"'); i++); if (!ch[i]) ERREUR("bad POS string:", ch); ch[i] = '\0'; for (++i; (ch[i]) && (strncmp(ch + i, "pos=", 4)); i++); if (!ch[i]) ERREUR("bad POS string:", ch); (*chpos) = ch + i + 5; for (i += 5; (ch[i]) && (ch[i] != '"'); i++); if (!ch[i]) ERREUR("bad POS string:", ch); ch[i] = '\0'; for (++i; (ch[i]) && (ch[i] != '>'); i++); if (!ch[i]) ERREUR("bad POS string:", ch); for (++i; (ch[i]) && (ch[i] == ' '); i++); (*chword) = ch + i; for (++i; (ch[i]) && (ch[i] != ' ') && (ch[i] != '<'); i++); if (!ch[i]) ERREUR("bad POS string:", ch); ch[i] = '\0'; } /* * analyse NE */ void analyze_tag(char *ch, char **desc, char **type, char **extent) { int i; *desc = *type = *extent = NULL; for (i = 0; ch[i]; i++) if (!strncmp(ch + i, "desc=", 5)) { *desc = ch + i + 6; for (i += 6; (ch[i]) && (ch[i] != '"'); i++); if (!ch[i]) ERREUR("bad format:", ch); ch[i] = '\0'; } else if (!strncmp(ch + i, "type=", 5)) { *type = ch + i + 6; for (i += 6; (ch[i]) && (ch[i] != '"'); i++); if (!ch[i]) ERREUR("bad format:", ch); ch[i] = '\0'; } else if (!strncmp(ch + i, "extent=", 7)) { *extent = ch + i + 8; for (i += 8; (ch[i]) && (ch[i] != '"'); i++); if (!ch[i]) ERREUR("bad format:", ch); ch[i] = '\0'; } } int main(int argc, char **argv) { char ch[TailleLigne], ch2[TailleLigne], *chid, *chpos, *chword, *chdesc, *chtype, *chextent, chNE[100]; int nb, insent, i, j, beginNE; FILE *file_tk, *file_pos; file_tk = file_pos = NULL; if (argc > 1) for (nb = 1; nb < argc; nb++) if (!strcmp(argv[nb], "-tk")) { if (nb + 1 == argc) ERREUR("an option must follow option:", argv[nb]); if (!(file_tk = fopen(argv[++nb], "rt"))) ERREUR("can't open:", argv[nb]); } else if (!strcmp(argv[nb], "-pos")) { if (nb + 1 == argc) ERREUR("an option must follow option:", argv[nb]); if (!(file_pos = fopen(argv[++nb], "rt"))) ERREUR("can't open:", argv[nb]); } else if (!strcmp(argv[nb], "-h")) { fprintf(stderr, "Syntax: %s [-h] -tk -pos \n", argv[0]); exit(0); } else ERREUR("unknown option:", argv[nb]); if ((!file_tk) || (!file_pos)) ERREUR("bad syntax, check '-h'", ""); while ((fgets(ch2, TailleLigne, file_pos)) && (!strstr(ch2, "