tagg2text.c
2.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
/*
* Output a text corpus with NE tags from a tagged corpus with some
* rewriting rules
*/
/* FRED1106 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
/* ................................................................ */
#define TailleLigne 180000
#define True 1
#define False 0
/*
 * Report a fatal error and terminate the program.
 *
 * Prints "ERREUR : <ch1> <ch2>" followed by a newline on stderr, then
 * exits with EXIT_FAILURE so that calling scripts and pipelines can
 * detect that the run did not complete.  This function never returns.
 *
 * ch1: first part of the message (typically a short description).
 * ch2: second part of the message (typically the offending value).
 */
void
ERREUR(const char *ch1, const char *ch2)
{
    fprintf(stderr, "ERREUR : %s %s\n", ch1, ch2);
    exit(EXIT_FAILURE);
}
/* ................................................................ */
/* Non-zero when a lies in the range 'A'..'Z'; the argument may be evaluated twice. */
#define IFCAPITAL(a) (('A' <= (a)) && ((a) <= 'Z'))
/*
 * Table of separator characters.  The final entry, the character '0'
 * (digit zero, not the NUL byte), is the end-of-table sentinel and is
 * therefore never reported as a separator itself.
 */
char T_separateur[] = {'(', ')', ',', ';', '!', ':', '0'};

/*
 * Return True when c is one of the separator characters stored in
 * T_separateur before the sentinel, False otherwise.
 */
int
if_separateur(char c)
{
    const char *entry = T_separateur;

    /* Walk the table until the '0' sentinel is reached. */
    while (*entry != '0')
    {
        if (*entry == c)
            return True;
        entry++;
    }
    return False;
}
/*
* format:
*
* de PREPADE O la DETFS O Loi MOTINC O sur PREP O le
* DETMS O Parlement MOTINC B-loc du PREPDU I-loc Canada
* XPAYMS I-loc
*
*/
/*
 * Return True when the string ch starts with a capital letter in the
 * range 'A'..'Z' and contains the sequence "--" somewhere, False
 * otherwise.  An empty string yields False.
 */
int
potential_cpn(char *ch)
{
    /* Guard: the first character must be a capital letter. */
    if (!IFCAPITAL(ch[0]))
        return False;

    return (strstr(ch, "--") != NULL) ? True : False;
}
/*
 * Convert a tagged corpus read on stdin into running text with inline
 * entity markup written on stdout.
 *
 * Every non-blank input line must hold at least three fields separated
 * by spaces or tabs: the word, a second field that is not used here,
 * and a tag.  A tag whose first character is 'O' closes any open entity.
 * For any other tag, the text after its first two characters (e.g.
 * "loc" in "B-loc") is the entity name: "<name> " is printed when the
 * name differs from the currently open one, and "</name> " when the
 * entity ends.  Blank lines and lines starting with "--LB--" end the
 * current entity; see the comments in the loop for sentence handling.
 *
 * argc and argv are unused: no command-line options are handled.
 * Returns 0.  Malformed lines are reported through ERREUR(), which
 * terminates the program.
 */
int
main(int argc, char **argv)
{
    char ch[TailleLigne], prev[TailleLigne];
    char *word, *label, *tag;
    int nb, have_line;

    (void) argc;
    (void) argv;

    prev[0] = '\0';     /* name of the entity currently open, "" if none */
    printf("<s> ");

    /*
     * The loop is driven by the result of fgets() rather than feof():
     * testing feof() after an unchecked read drops a final line that has
     * no trailing newline and never terminates on a read error.
     */
    have_line = (fgets(ch, TailleLigne, stdin) != NULL);
    for (nb = 0; have_line; nb++)
    {
        if ((nb + 1) % 100000 == 0)
            fprintf(stderr, "En cours : %d\n", nb + 1);

        if ((ch[0] == '\0') || (ch[0] == '\n') || (!strncmp(ch, "--LB--", 6)))
        {
            /* Sentence boundary: close any entity still open. */
            if (prev[0])
                printf("</%s> ", prev);
            prev[0] = '\0';

            if (!strncmp(ch, "--LB--", 6))
            {
                /* The marker is emitted as a sentence of its own. */
                printf("</s>\n<s> --LB-- </s>\n<s> ");
                have_line = (fgets(ch, TailleLigne, stdin) != NULL);
            }
            else
            {
                /*
                 * After a blank line a new sentence is opened only when
                 * another line exists and does not start with one of the
                 * characters recognised by if_separateur().
                 */
                have_line = (fgets(ch, TailleLigne, stdin) != NULL);
                if (have_line && !if_separateur(ch[0]))
                    printf("</s>\n<s> ");
            }
            continue;
        }

        /* Split the line into its first three fields. */
        word = strtok(ch, " \t\n");
        label = word ? strtok(NULL, " \t\n") : NULL;
        tag = label ? strtok(NULL, " \t\n") : NULL;
        if (!tag)
            ERREUR("bad format:", ch);

        if (tag[0] == 'O')
        {
            if (prev[0])
                printf("</%s> ", prev);
            prev[0] = '\0';
            /*
             * A disabled correction for compounds containing "--" used to
             * sit here: when potential_cpn(word) held, it opened a "gsp"
             * entity.
             */
        }
        else
        {
            /*
             * The entity name starts two characters into the tag; a
             * one-character tag would make tag + 2 point past the
             * token's terminator, so reject it as malformed.
             */
            if (tag[1] == '\0')
                ERREUR("bad format:", tag);

            if (strcmp(prev, tag + 2))
            {
                if (prev[0])
                    printf("</%s> ", prev);
                /* tag lies inside ch, which has the same size as prev. */
                strcpy(prev, tag + 2);
                printf("<%s> ", prev);
            }
        }

        if (!strcmp(word, "</s>"))
            printf("</s>\n");
        else
            printf("%s ", word);

        have_line = (fgets(ch, TailleLigne, stdin) != NULL);
    }

    printf("</s>\n");
    return 0;
}