Blame view

tools/lia_ltbox/lia_phon/src/tagg/extract_name.c 2.41 KB
e6be5137b   Jean-François Rey   reinitialized pro...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
  /*  Extract proper names from a POS-tagged text corpus  */
  
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <strings.h>
  
  /*................................................................*/
  
  /* Size in bytes of the buffer used to read one input line. */
  #define TailleLigne	40000
  
  /* Integer truth values returned by the predicates of this file. */
  #define True	1
  #define False	0
  
  /*
   * Report a fatal error on stderr and terminate the program.
   *
   * ch1, ch2 : the two message fragments printed after the "ERREUR :" prefix,
   *            separated by a single space.
   *
   * Never returns. The process exits with a failure status so that calling
   * scripts can detect the error.
   */
  void ERREUR(char *ch1,char *ch2)
  {
  fprintf(stderr,"ERREUR : %s %s\n",ch1,ch2);
  exit(EXIT_FAILURE);
  }
  
  /*................................................................*/
  
  /*
   * Character-class tests on a single character. They compare against plain
   * character ranges/literals only, and each macro evaluates its argument
   * several times, so the argument must not have side effects.
   */
  /* True when a lies in the range 'A'..'Z'. */
  #define IF_MAJU(a)	(((a)>='A')&&((a)<='Z'))
  /* True when a lies outside both 'A'..'Z' and 'a'..'z'. */
  #define NON_ALPHA(a)	(((a)<'A')||(((a)>'Z')&&((a)<'a'))||((a)>'z'))
  
  /* True when a is one of the punctuation marks , . ; : ! ? */
  #define IF_PONCTU(a)	(((a)==',')||((a)=='.')||((a)==';')||((a)==':')||((a)=='!')||((a)=='?'))
  
  /* True when a is one of the separator characters - _ & * @ */
  #define IF_SEPAR(a)	(((a)=='-')||((a)=='_')||((a)=='&')||((a)=='*')||((a)=='@'))
  
  /* Capacity in bytes (terminator included) of one word or tag slot. */
  #define TailleMot	200
  /* Number of consecutive (word, tag) pairs kept in the sliding window. */
  #define WindowSize	10
  
  /* Sliding window over the input: slot 0 is the oldest entry and slot
     WindowSize-1 the most recently read one. */
  char T_word[WindowSize][TailleMot];	/* the words                               */
  char T_tagg[WindowSize][TailleMot];	/* the tag attached to each word           */
  char T_flag[WindowSize];		/* 1 once the word was printed, 0 otherwise */
  
  /*
   * Decide whether window slot i, taken on its own, is a proper-name word.
   *
   * A slot qualifies when its tag starts with 'X', or when its tag is exactly
   * "MOTINC" and the word starts with a letter in 'A'..'Z'.
   *
   * i : index into T_word / T_tagg; the caller must keep it inside the window.
   * Returns True or False.
   */
  int sure_name(int i)
  {
  const char *tag=T_tagg[i];
  
  if (tag[0]=='X') return True;
  if (strcmp(tag,"MOTINC")!=0) return False;
  return IF_MAJU(T_word[i][0]) ? True : False;
  }
  
  /*
   * Measure the proper-name candidate that starts at window slot 0.
   *
   * Slots are accepted one after the other while either
   *   - the slot itself satisfies sure_name(), or
   *   - the slot is not the first one, is followed by another slot that
   *     satisfies sure_name(), and is a linking token: a word starting with
   *     a separator character (IF_SEPAR) or exactly "de" or "y".
   *
   * Returns the number of accepted slots (0 when slot 0 is not a name,
   * at most WindowSize).
   */
  int find_name()
  {
  int len=0;
  
  while (len<WindowSize)
   {
   if (!sure_name(len))
    {
    /* a linking token needs a predecessor and a successor in the window */
    int inside=(len>0)&&(len<WindowSize-1);
    int link=IF_SEPAR(T_word[len][0])
           ||(!strcmp(T_word[len],"de"))
           ||(!strcmp(T_word[len],"y"));
    if (!(inside&&link&&sure_name(len+1))) break;
    }
   len++;
   }
  return len;
  }
  
  void traite_ligne(char *ch)
  {
  static char *lvide="  ";
  char *word,*tagg;
  int nb,i,j;
  
  if (ch)
   {
   word=strtok(ch," \t
  ");
   if (word) tagg=strtok(NULL," \t
  "); else tagg=NULL;
   if ((!word)||(!tagg)) { fprintf(stderr,"ERROR: bad format in input file line %d
  ",nb+1); exit(0); }
   if (strlen(word)>=TailleMot) word[TailleMot-2]='\0';
   if (strlen(tagg)>=TailleMot) tagg[TailleMot-2]='\0';
   }
  else tagg=word=lvide;
  
  for(i=1;i<WindowSize;i++)
   {
   strcpy(T_word[i-1],T_word[i]);
   strcpy(T_tagg[i-1],T_tagg[i]);
   T_flag[i-1]=T_flag[i];
   }
  strcpy(T_word[WindowSize-1],word);
  strcpy(T_tagg[WindowSize-1],tagg);
  T_flag[WindowSize-1]=0;
  
  if (T_flag[0]==0)
   {
   if ((j=find_name())&&((j>1)||(strlen(T_word[0])>1)))
    {
    for(i=0;i<j;i++)
     {
     if (i>0) printf(" ");
     printf("%s",T_word[i]);
     T_flag[i]=1;
    }
    printf("
  ");
    }
   }
  }
  
  /*
   * Entry point: read "word tag" lines from stdin and print the proper names
   * found by traite_ligne() on stdout. The command-line arguments are unused.
   */
  int main(int argc, char **argv)
  {
  char line[TailleLigne];
  int k;
  
  /* start from an empty window */
  k=0;
  while (k<WindowSize)
   {
   T_tagg[k][0]='\0';
   T_word[k][0]='\0';
   T_flag[k]='\0';
   k++;
   }
  
  /* feed every input line through the window */
  while (fgets(line,TailleLigne,stdin)!=NULL) traite_ligne(line);
  
  /* push WindowSize padding entries so the last real words reach slot 0 */
  for(k=WindowSize;k>0;k--) traite_ligne(NULL);
  
  return 0;
  }