/* Convert a CTM file into a TOKEN (MACAON-like) format */ /* FRED 0309 */ #include #include #include #include #include #include /* ................................................................ */ #define TailleLigne 80000 #define True 1 #define False 0 void ERREUR(char *ch1, char *ch2) { fprintf(stderr, "ERREUR : %s %s\n", ch1, ch2); exit(0); } /* ................................................................ */ /* * Format: * * CTM format: [ ] * * 20041006_0700_0800_CLASSIQUE 1 1.98 0.19 bonne 20041006_0700_0800_CLASSIQUE 1 * 2.17 0.54 journée 20041006_0700_0800_CLASSIQUE 1 2.71 0.19 bon * 20041006_0700_0800_CLASSIQUE 1 2.90 0.82 courage * 20041006_0700_0800_CLASSIQUE 1 3.72 0.27 pour 20041006_0700_0800_CLASSIQUE * 1 3.99 0.20 ce 20041006_0700_0800_CLASSIQUE 1 4.19 0.46 mercredi--time--1 * 20041006_0700_0800_CLASSIQUE 1 4.65 0.25 six--time--1 * 20041006_0700_0800_CLASSIQUE 1 4.90 0.44 octobre--time--1 * 20041006_0700_0800_CLASSIQUE 1 5.34 0.31 dans 20041006_0700_0800_CLASSIQUE * 1 5.65 0.15 une 20041006_0700_0800_CLASSIQUE 1 5.80 0.33 petite * * * TOKEN Format: et ce(2) * qui fût bon j' en(2) mange les hasards d' un(2) ange(2) de(2) * [carillon] et ce qui fût bon j' en mange les hasards d' un ange de [carillon] * */ /* ................................................................ */ #define MAX_SIZE_FILE 20000 #define MAX_SIZE_WORD 100 #define MAX_SIZE_SEGM 30000 int justspace(char *ch) { int i; for (i = 0; (ch[i]) && ((ch[i] == ' ') || (ch[i] == '\t') || (ch[i] == '\n')); i++); return ch[i] ? 
False : True; } int load_xxxx(FILE * file, int i, char t_segm[MAX_SIZE_SEGM][MAX_SIZE_WORD], char **chidxx) { static char ch[TailleLigne], *pt; if (!fgets(ch, TailleLigne, file)) ERREUR("no more XXXX", ""); pt = strtok(ch, " \t\n"); (*chidxx) = pt; for (pt = strtok(NULL, " \t\n"); pt; pt = strtok(NULL, " \t\n")) { if (i == MAX_SIZE_SEGM) ERREUR("cste MAX_SIZE_SEGM too small", ""); if (!strcmp(pt, "&")) strcpy(t_segm[i++], "et"); else if (!strcmp(pt, "&")) strcpy(t_segm[i++], "et"); else if (strcmp(pt, "_")) strcpy(t_segm[i++], pt); } return i; } /* ................................................................ */ #define THRESHOLD1 200 #define THRESHOLD2 400 char T_punct_s[] = { '.', '!', '?', ':', /* separateur 'classique' de fin de phrase */ '\0' }; char T_punct_w[] = { ';', ',', '<', '>', '"', '\0' }; int if_ponct(char c, char *tabl) { static int n; for (n = 0; tabl[n]; n++) if (c == tabl[n]) return 1; return 0; } int only_ponct_strong(char *ch) { int i; for (i = 0; (ch[i]) && (if_ponct(ch[i], T_punct_s)); i++); return ch[i] ? False : True; } int only_ponct_weak(char *ch) { int i; for (i = 0; (ch[i]) && (if_ponct(ch[i], T_punct_w)); i++); return ch[i] ? False : True; } int only_ponct(char *ch) { return (only_ponct_strong(ch)) || (only_ponct_weak(ch)); } /* ................................................................ 
*/

/*
 * find_time: extract a time span from a tag string.
 *
 *   chtag  string expected to contain the marker content=
 *   begin  receives the third field found after the marker
 *   end    receives *begin plus the fourth field (a duration)
 *
 * The text that follows the marker (plus one more character) is split on
 * spaces, tabs and newlines.  The first two fields are skipped.  A missing
 * marker, a missing field or an unparsable number ends the program
 * through ERREUR.
 *
 * The comment that stood here read CTM format followed by an empty field
 * list.  Judging by the order in which main parses a CTM line, the four
 * fields are presumably source, channel, begin and duration -- TODO
 * confirm.
 *
 * NOTE(review): no call to find_time is visible in this file.
 */
void find_time(char *chtag, double *begin, double *end)
{
    /* ch is a scratch copy of the tag text; strtok modifies it in place */
    static char ch[TailleLigne], *pt;
    double duration;
    int i;

    /* advance i to the first occurrence of the marker, or to the end of chtag */
    for (i = 0; (chtag[i]) && (strncmp(chtag + i, "content=", 8)); i++);
    if (!chtag[i])
        ERREUR("bad format:", chtag);
    /*
     * +9 skips the 8 characters of the marker and the character after it
     * (presumably an opening quote -- TODO confirm).
     * NOTE(review): the copy is unbounded and assumes that character exists.
     */
    strcpy(ch, chtag + i + 9);
    /* field 1: skipped */
    pt = strtok(ch, " \t\n");
    if (!pt)
        ERREUR("bad format:", ch);
    /* field 2: skipped */
    pt = strtok(NULL, " \t\n");
    if (!pt)
        ERREUR("bad format:", ch);
    /* field 3: begin time */
    pt = strtok(NULL, " \t\n");
    if (!pt)
        ERREUR("bad format:", ch);
    if (sscanf(pt, "%lf", begin) != 1)
        ERREUR("bad format:", pt);
    /* field 4: duration */
    pt = strtok(NULL, " \t\n");
    if (!pt)
        ERREUR("bad format:", ch);
    if (sscanf(pt, "%lf", &duration) != 1)
        ERREUR("bad format:", pt);
    *end = (*begin) + duration;
}

/*
 * main: read a CTM file on stdin and write the TOKEN document on stdout.
 *
 * Options handled below: -name followed by a value, -nocap, and -h.
 *
 * Steps, as far as the visible code shows:
 *   1. print header lines, then copy the leading input lines that start
 *      with a semicolon;
 *   2. for each remaining input line, split the five CTM fields, record
 *      two rows in t_file (a formatted row, then a row XXXXn) and write
 *      the line XXXXn word to a temporary file;
 *   3. run the shell script clean_txt.csh on that temporary file;
 *   4. read the cleaned file back and fill t_segm, checking that each
 *      XXXXn identifier matches the one recorded in t_file;
 *   5. remove the temporary files and print the sentences.
 *
 * NOTE(review): several string literals below consist of a bare newline
 * although arguments are passed for them, and two statements in the
 * sentence loop are not valid C as they stand.  Text appears to be
 * missing from this copy of the file; restore those spots from the
 * original before building.
 *
 * NOTE(review): pt, i_fin and ignore are not referenced in the visible
 * code.  The local arrays t_file and t_segm total about five megabytes
 * of automatic storage.
 */
int main(int argc, char **argv)
{
    char *chname, *chasr, *chsource, *chcanal, *chdebut, *chduree, *chmot, chtrans[TailleLigne];
    char ch[TailleLigne], *pt, ch2[TailleLigne], t_file[MAX_SIZE_FILE][MAX_SIZE_WORD],
        t_segm[MAX_SIZE_SEGM][MAX_SIZE_WORD], *chidxx;
    int nb, idsent, idtoken, itime, i_sent, i, j, i_segm, if_minu, i_deb, i_fin, ignore, nbtoken;
    FILE *filout, *filein, *filetemp;
    double begin, end, lastend;
    struct tms time_struct;

    chname = NULL;
    if_minu=False;
    /* command-line options */
    if (argc > 1)
        for (nb = 1; nb < argc; nb++)
            if (!strcmp(argv[nb], "-name")) {
                if (nb + 1 == argc)
                    ERREUR("an option must follow option:", argv[nb]);
                chname = argv[++nb];
            } else if (!strcmp(argv[nb],"-nocap")) {
                if_minu=True;
            } else if (!strcmp(argv[nb], "-h")) {
                /* the usage text mentions -asr, but no branch handles it */
                fprintf(stderr, "Syntax: %s [-h] [-name -asr -nocap]\n", argv[0]);
                exit(0);
            } else
                ERREUR("unknown option:", argv[nb]);
    filout = stdout;
    filein = stdin;
    /* document header */
    fprintf(filout, "\n");
    /*
     * NOTE(review): chasr is read here but never assigned in the visible
     * code, so its value is indeterminate.  The format string holds only
     * a newline while three arguments are supplied.
     */
    fprintf(filout, "\n", chname ? chname : "XXXX", chasr ? "AUTO" : "MANUAL", chasr ? chasr : "NONE");
    fprintf(filout, "\n");
    /*
     * Copy the leading lines that start with a semicolon.  When the loop
     * stops on a successful read, ch holds the first line that does not.
     */
    while ((fgets(ch, TailleLigne, filein)) && (ch[0] == ';'))
        fprintf(filout, "%s", ch);
    fprintf(filout, "\n");
    /* the temporary file names are built from the value returned by times() */
    itime = (int) (times(&time_struct));
    sprintf(ch2, "tmp%d.txt", itime);
    if (!(filetemp = fopen(ch2, "wt")))
        ERREUR("can't write in:", ch2);
    /*
     * One iteration per CTM line; ch already holds the line to process.
     * NOTE(review): i_sent is tested and used as an index below without
     * any visible initialisation.
     */
    for (i_segm = 0, idsent = 1, nb = 1, idtoken = 0; !feof(filein); nb++) {
        if (i_sent > MAX_SIZE_FILE - 100)
            ERREUR("cste MAX_SIZE_FILE too small", "");
        strtok(ch, "\n");
        /* the five CTM fields: source, channel, begin, duration, word */
        chsource = strtok(ch, " \t\n");
        if (!chsource)
            ERREUR("bad format source:", ch);
        chcanal = strtok(NULL, " \t\n");
        if (!chcanal)
            ERREUR("bad format canal:", ch);
        chdebut = strtok(NULL, " \t\n");
        if (!chdebut)
            ERREUR("bad format debut:", ch);
        chduree = strtok(NULL, " \t\n");
        if (!chduree)
            ERREUR("bad format duree:", ch);
        chmot = strtok(NULL, " \t\n");
        if (!chmot)
            ERREUR("bad format mot:", ch);
        /*
         * NOTE(review): four arguments are passed but the format string
         * holds only a newline; the row layout is missing here.  The
         * write is also unbounded for a MAX_SIZE_WORD-byte row.
         */
        sprintf(t_file[i_sent], "\n", chsource, chcanal, chdebut, chduree);
        i_sent++;
        /* placeholder row, matched later against the cleaned file */
        sprintf(t_file[i_sent++], "XXXX%d", i_segm);
        fprintf(filetemp, "XXXX%d ", i_segm++);
        fprintf(filetemp, "%s\n", chmot);
        /* read the next line; its result is only observed through feof above */
        fgets(ch, TailleLigne, filein);
    }
    fclose(filetemp);
    /*
     * An earlier, commented-out version built a longer pipeline here: sed
     * substitutions turning - and _ into spaces and & into et, followed by
     * lia_clean.biglex_ne and onelinex.  The command below replaced it.
     */
    /* the script path comes from the environment variable LIA_NE */
    sprintf(ch2, "cat tmp%d.txt | $LIA_NE/script/clean_txt.csh %s > tmp%d.txt2", itime, if_minu?"-nocap":"",itime);
    /* NOTE(review): the status returned by system is not checked */
    system(ch2);
    /*
     * Now we process the table of strings with the file produced by the
     * cleaning step and we fill t_segm.
     */
    /*
     * Example of cleaned lines:
     *   XXXX0 sept heures
     *   XXXX1 à l' écoute d'
     */
    sprintf(ch2, "tmp%d.txt2", itime);
    if (!(filetemp = fopen(ch2, "rt")))
        ERREUR("can't read:", ch2);
    for (i_segm = i = 0; i < i_sent; i++) {
        /*
         * NOTE(review): i_segm indexes t_segm, whose size is MAX_SIZE_SEGM,
         * yet the bound tested here is MAX_SIZE_FILE.
         */
        if (i_segm > MAX_SIZE_FILE - 100)
            ERREUR("cste MAX_SIZE_FILE too small", "");
        if (!strncmp(t_file[i], "XXXX", 4)) {
            /* placeholder row: append the cleaned words and check the identifier */
            i_segm = load_xxxx(filetemp, i_segm, t_segm, &chidxx);
            if (strcmp(t_file[i], chidxx))
                ERREUR("mismatch:", t_file[i]);
        } else
            /* any other row is copied as is */
            strcpy(t_segm[i_segm++], t_file[i]);
    }
    fclose(filetemp);
    sprintf(ch2, "rm tmp%d.txt tmp%d.txt2", itime, itime);
    system(ch2);
    /* now we find the sentences */
    for (chtrans[0] = '\0', idsent = idtoken = nbtoken = i_deb = i = 0; i_deb < i_segm;) {
        /*
         * NOTE(review): the condition below is not valid C as it stands
         * (strncmp receives two arguments and the parentheses do not
         * balance); part of it appears to be missing.  begin, end and
         * lastend are never assigned before being read in the visible
         * code.
         */
        if (!strncmp(t_segm[i], "")) ||
            (((i - i_deb) > THRESHOLD1) && (only_ponct(t_segm[i]))) ||
            ((i - i_deb) > THRESHOLD2) ||
            ((begin - lastend) > 0.4)) {
            /* start of sentence */
            /* NOTE(review): nothing bounds the strcat calls on chtrans */
            if ((i < i_segm) && (t_segm[i][0] != '<'))
                strcat(chtrans, t_segm[i]);
            if ((i < i_segm) && (strcmp(t_segm[i], "")))
                nbtoken++;
            /* NOTE(review): the next three format strings look truncated as well */
            fprintf(filout, "\n", ++idsent);
            fprintf(filout, "%s\n", chtrans);
            fprintf(filout, "\n", nbtoken);
            /* rows i_deb to i belong to the sentence being printed */
            for (j = i_deb; (j < i_segm) && (j <= i); j++)
                if (strcmp(t_segm[j], "")) {
                    /* NOTE(review): incomplete statement, text is missing here */
                    if (!strncmp(t_segm[j], "%s\n", idsent, ++idtoken, t_segm[j]);
                }
            fprintf(filout, "\n");
            fprintf(filout, "\n");
            /* the next sentence starts after the current row */
            i_deb = i + 1;
            nbtoken = 0;
            chtrans[0] = '\0';
        } else {
            /* rows that start with < are left out of the transcription text */
            if ((i < i_segm) && (t_segm[i][0] != '<')) {
                strcat(chtrans, t_segm[i]);
                strcat(chtrans, " ");
            }
            nbtoken++;
        }
        if (i < i_segm)
            i++;
        lastend = end;
    }
    fprintf(filout, "\n");
    exit(0);
}