Yannick Estève / ONTRAC-Kaldi

Blame view

tools/sctk-2.4.10/src/sclite/stm.c 7.76 KB
  #include "sctk.h"
  
  /*
   * alloc_STM: create an empty STM container.
   *
   *   n  - number of STM_SEG slots requested for the segment array;
   *        stored in the `max` field.
   *
   * The returned structure starts with `num` and `s` both set to 0.
   * Storage for the structure and its segment array is obtained through
   * the alloc_singarr macro; release it with free_STM().
   */
  STM *alloc_STM(int n){
      STM *stm;

      alloc_singarr(stm, 1, STM);

      /* nothing loaded yet; the read window starts at the front */
      stm->s   = 0;
      stm->num = 0;
      stm->max = n;

      alloc_singarr(stm->seg, stm->max, STM_SEG);

      return stm;
  }
  
  /*
   * expand_STM: enlarge the segment array of `stm`.
   *
   * Delegates entirely to expand_singarr, handing it the segment array,
   * the current element count (stm->num), the capacity (stm->max), the
   * constant 2 and the element type.
   * NOTE(review): presumably the constant is a growth factor and the
   * macro updates stm->max in place -- confirm against the definition
   * of expand_singarr.
   */
  void expand_STM(STM *stm){
      expand_singarr(stm->seg,stm->num,stm->max,2,STM_SEG);
  }    
  
  /*
   * free_STM_SEG: release the strings owned by one segment.
   *
   * Frees the file, spkr, chan and text fields unconditionally and the
   * labels field only when it is non-null.  The STM_SEG itself is not
   * freed, and the freed fields are left unchanged (not reset to null).
   * A null `seg` is accepted and ignored.
   */
  void free_STM_SEG(STM_SEG *seg){
      if (seg != (STM_SEG *)0){
          TEXT_free(seg->file);
          TEXT_free(seg->spkr);
          TEXT_free(seg->chan);
          TEXT_free(seg->text);

          /* labels is optional: only present when the record had a <...> tag */
          if (seg->labels != (TEXT *)0)
              TEXT_free(seg->labels);
      }
  }
  
  /*
   * free_STM: destroy an STM created by alloc_STM().
   *
   * Releases the strings of the first stm->num segments, then the
   * segment array, then the STM structure itself.
   */
  void free_STM(STM *stm){
      int idx = 0;

      while (idx < stm->num){
          free_STM_SEG(&(stm->seg[idx]));
          idx++;
      }

      free_singarr(stm->seg, STM_SEG);
      free_singarr(stm, STM);
  }
  
  /*
   * read_stm_line: read lines from `fp` into *buf until one is found
   * that is neither a comment, a comment-info line, nor empty, or until
   * end of file is reached.
   *
   *   buf - address of the line buffer; TEXT_ensure_fgets receives this
   *         address, so the buffer pointer may be replaced
   *   len - address of the buffer length, passed to TEXT_ensure_fgets
   *   fp  - input stream
   *
   * When TEXT_ensure_fgets returns NULL the buffer is set to the empty
   * string.  The first two characters of the buffer are overwritten on
   * entry, so the buffer must hold at least two elements.
   */
  void read_stm_line(TEXT **buf, int *len, FILE *fp){
      /* Seed the buffer with two comment characters.
         NOTE(review): presumably this makes TEXT_is_comment() true so the
         loop below performs at least one read -- confirm. */
      (*buf)[1] = COMMENT_CHAR;
      (*buf)[0] = COMMENT_CHAR;

      for (;;){
          if (feof(fp))
              break;

          /* stop as soon as the buffer holds a real data line */
          if (!TEXT_is_comment(*buf) &&
              !TEXT_is_comment_info(*buf) &&
              !TEXT_is_empty(*buf))
              break;

          if (TEXT_ensure_fgets(buf, len, fp) == NULL)
              (*buf)[0] = '\0';
      }
  }
  
  
  /*
   * parse_stm_line: split one STM record into the fields of `seg`.
   *
   * Record layout, whitespace separated:
   *     file chan spkr t1 t2 [<labels>] [text ...]
   *
   *   seg        - segment to fill; file, chan, spkr, text (and labels,
   *                when present) receive freshly duplicated strings
   *   buf_ptr    - address of the line buffer; the buffer is modified in
   *                place by the tokenizer and may be replaced when the
   *                case is changed
   *   buf_len    - address of the buffer length (used by the case change)
   *   case_sense - when zero the line is case-changed before parsing
   *   dbg        - when non-zero the parsed fields are printed to stdout
   *
   * An empty line leaves `seg` completely untouched.
   *
   * NOTE(review): a line with fewer than five tokens hands a null token
   * to TEXT_strdup / TEXT_atof; whether those helpers tolerate null is
   * not visible here -- confirm before feeding untrusted input.
   */
  void parse_stm_line(STM_SEG *seg, TEXT **buf_ptr, int *buf_len, int case_sense, int dbg){
      TEXT *delims = (TEXT *)" \t\n";
      TEXT *rp, *buf, *tok_end;
      int i, len;

      if (**buf_ptr == '\0')
          return;

      if (!case_sense)
          TEXT_str_case_change_with_mem_expand(buf_ptr, buf_len, 1);

      /* fetch the buffer only now: the case change may have replaced it */
      buf = *buf_ptr;
      len = TEXT_strlen(buf);

      /* the five mandatory fields, in record order */
      rp = TEXT_strtok(buf, delims);         seg->file = TEXT_strdup(rp);
      rp = TEXT_strtok((TEXT *)0, delims);   seg->chan = TEXT_strdup(rp);
      rp = TEXT_strtok((TEXT *)0, delims);   seg->spkr = TEXT_strdup(rp);
      rp = TEXT_strtok((TEXT *)0, delims);   seg->t1   = TEXT_atof(rp);
      rp = TEXT_strtok((TEXT *)0, delims);   seg->t2   = TEXT_atof(rp);
      seg->flag1 = 0;
      seg->labels = (TEXT *)0;

      if (rp != (TEXT *)0){
          /* snag the next token; if it is a set identifier (<...>), keep */
          /* it as the labels and move on to the token after it           */
          rp = TEXT_strtok((TEXT *)0, delims);
          if ((rp != (TEXT *)0) && (*rp == '<')){
              seg->labels = TEXT_strdup(rp);
              rp = TEXT_strtok((TEXT *)0, delims);
          }
      }

      if (rp != (TEXT *)0){
          /* rp is the first word of the text.  The tokenizer replaced the */
          /* delimiter after it with a null; put a space back (provided    */
          /* there is data beyond the token) so the rest of the line is    */
          /* part of the text again.                                       */
          tok_end = rp + TEXT_strlen(rp);
          if (tok_end < buf + len)
              *tok_end = ' ';
          seg->text = TEXT_strdup(rp);
      } else
          seg->text = TEXT_strdup((TEXT *)"");

      if (dbg){
          printf("Parsed: file: %s\n",seg->file);
          printf("        chan: %s\n",seg->chan);
          printf("        spkr: %s\n",seg->spkr);
          printf("        t1  : %f\n",seg->t1);
          printf("        t2  : %f\n",seg->t2);
          printf("        text: %s\n",seg->text);
      }

      /* drop a trailing newline from the text (never from position 0) */
      i = TEXT_strlen(seg->text) - 1;
      if (i > 0 && seg->text[i] == '\n')
          seg->text[i] = (TEXT)0;
  }
  
  /*
   * fill_STM: read STM records from `fp` into `stm` until the segment
   * array is nearly full (stm->num + 1 reaches stm->max) or end of file.
   *
   *   fp          - open STM input stream
   *   stm         - container to fill; new segments are appended at
   *                 index stm->num
   *   fname       - name of the STM file (not used in this function)
   *   end_of_file - receives feof(fp) on return
   *   case_sense  - forwarded to parse_stm_line
   *   perr        - always set to 0 by this function
   *
   * Before reading, when stm->s is greater than 1, the segments in front
   * of stm->s are freed and the remaining ones are moved to the start of
   * the array, after which stm->s becomes 0.
   * NOTE(review): the test is `s > 1`, so a window starting at index 1
   * is not compacted -- confirm this is intended.
   *
   * The line-buffer length is kept in a function-static so that growth
   * performed while reading is remembered across calls.
   */
  void fill_STM(FILE *fp, STM *stm, char *fname, boolean *end_of_file, int case_sense, int *perr){
      static int len=100;
      int i;
      TEXT *buf;
      int dbg=0;

      alloc_singZ(buf,len,TEXT,(TEXT)0);
      *perr = 0;

      /* if the data doesn't begin at 1, copy it down and go from there */
      if (stm->s > 1){
          /* first free the already consumed segments */
          for (i=0; i<stm->s; i++)
              free_STM_SEG(&(stm->seg[i]));

          /* then copy down the residual data (including slot stm->num) */
          for (i=stm->s; i<=stm->num; i++)
              stm->seg[i - stm->s] = stm->seg[i];

          stm->num = stm->num - stm->s;
          stm->s=0;
      }

      /* now read in the data */
      while (!feof(fp) && stm->num+1 < stm->max){
          read_stm_line(&buf,&len,fp);
          if (dbg) printf("STM Read %s\n",buf);

          /* an empty buffer means no record was read (end of input) */
          if (*buf != NULL_TEXT){
              parse_stm_line(&(stm->seg[stm->num]), &buf, &len, case_sense, dbg);
              stm->num++;
          }
      }
      *end_of_file = feof(fp);
      free_singarr(buf,TEXT);
  }
  
  
  /*
   * locate_STM_boundary: find where the run of segments that match
   * segment `start` ends.
   *
   *   stm     - loaded segments
   *   start   - index of the first segment of the run
   *   by_file - when non-zero, segments must share start's file name
   *   by_chan - when non-zero, segments must share start's channel
   *   end     - receives the result
   *
   * Result written to *end:
   *   - `start`          when start equals stm->num (nothing to scan);
   *   - the index of the last matching segment when a non-matching
   *     segment is found;
   *   - stm->num         when every segment from start onward matches.
   */
  void locate_STM_boundary(STM *stm, int start, int by_file, int by_chan, int *end){
      int idx;

      if (start == stm->num){
          *end = start;
          return;
      }

      for (idx = start; idx < stm->num; idx++){
          int same_file = 1;
          int same_chan = 1;

          if (by_file)
              same_file = (TEXT_strcmp(stm->seg[start].file,
                                       stm->seg[idx].file) == 0);
          if (by_chan)
              same_chan = (TEXT_strcmp(stm->seg[start].chan,
                                       stm->seg[idx].chan) == 0);

          if (!same_file || !same_chan){
              /* the run ended on the previous segment */
              *end = idx - 1;
              return;
          }
      }

      /* no boundary inside the loaded data */
      *end = stm->num;
  }
  
  /*
   * dump_STM_words: write the words of segments s..e (both inclusive)
   * to `file`, one word per line.
   *
   *   stm  - loaded segments
   *   s, e - first and last segment index to dump
   *   file - path of the output file; it is created or truncated
   *
   * Exits the program with status 1 when the file cannot be opened.
   *
   * Each segment text is copied into a scratch buffer before it is
   * tokenized, so the stored text is left untouched.  tbuf_len tracks
   * the scratch capacity in elements, terminator included; the buffer
   * is replaced whenever the text plus its terminator would not fit.
   * (The former test, `strlen > tbuf_len+1`, allowed texts of length
   * tbuf_len and tbuf_len+1 to be copied into a tbuf_len-element
   * buffer.)
   */
  void dump_STM_words(STM *stm,int s, int e, char *file){
      int i;
      FILE *fp = fopen(file,"w");
      TEXT *ctext, *tbuf;
      int tbuf_len=100;

      if (fp == NULL){
          fprintf(stderr,"Error: Can't open STM words file '%s'\n",file);
          exit(1);
      }
      alloc_singZ(tbuf,tbuf_len,TEXT,'\0');

      for (i=s; i<=e; i++){
          int text_len = TEXT_strlen(stm->seg[i].text);

          if (text_len >= tbuf_len){
              /* too small for text + terminator: swap in a fresh copy */
              free_singarr(tbuf,TEXT);
              tbuf = TEXT_strdup(stm->seg[i].text);
              tbuf_len = text_len + 1;
          } else
              TEXT_strcpy(tbuf,stm->seg[i].text);

          /* Write each individual word to the file */
          ctext = tokenize_TEXT_first_alt(tbuf,(TEXT *)" \t\n");
          while (ctext != NULL) {
              fprintf(fp,"%s\n",ctext);
              ctext = tokenize_TEXT_first_alt(NULL,(TEXT *)" \t\n");
          }
      }
      fclose(fp);
      free_singarr(tbuf,TEXT);
  }
  
  /*
   * dump_STM: print segments s..e-1 of `stm` to stdout for debugging.
   *
   * A header line shows the requested range next to stm->s and
   * stm->num, followed by one line per segment and a final blank line.
   * Note that `e` is exclusive here.
   */
  void dump_STM(STM *stm, int s, int e){
      char *proc="dump_STM";
      int i;

      printf("%s: Range [%d,%d]  of  [%d,%d]\n",proc,s,e,stm->s,stm->num);
      for (i=s; i<e; i++)
          printf("%d: %s chan: %s  Spkr: %s  T1: %f  T2:%f  Text: '%s'\n",i,
                 stm->seg[i].file,stm->seg[i].chan,
                 stm->seg[i].spkr,stm->seg[i].t1,
                 stm->seg[i].t2,stm->seg[i].text);
      printf("\n");
  }
  
  /*
   * convert_stm_to_word_list: read the STM file `file` and write every
   * word of every record's text to `words`, one word per line.
   *
   *   file       - path of the input STM file
   *   words      - path of the output word file; created or truncated
   *   case_sense - forwarded to parse_stm_line
   *   num_ref    - receives the number of records processed
   *
   * Exits the program with status 1 when either file cannot be opened.
   *
   * NOTE(review): the loop stops as soon as feof(fp) is true after a
   * read, even if that read delivered a line.  Whether a final record
   * without a trailing newline is therefore skipped depends on
   * TEXT_ensure_fgets -- confirm.
   */
  void convert_stm_to_word_list(char *file, char *words, int case_sense, int *num_ref){
      STM_SEG seg;
      FILE *fp, *fp_out;
      TEXT *buf, *ctext;
      int buf_len=100;

      if ((fp=fopen(file,"r")) == NULL){
          fprintf(stderr,"Error: Unable to open input STM file '%s'\n",file);
          exit(1);
      }

      if ((fp_out=fopen(words,"w")) == NULL){
          fprintf(stderr,"Error: Unable to open STM word file '%s'\n",words);
          exit(1);
      }
      (*num_ref) = 0;
      alloc_singZ(buf,buf_len,TEXT,'\0');

      /* process the STM file one record at a time */
      while (!feof(fp)){
          /* read the next ref line */
          read_stm_line(&buf,&buf_len,fp);
          if (feof(fp)) break;

          /* parse the reference transcript */
          parse_stm_line(&seg,&buf,&buf_len,case_sense,0);

          /* write each reference word on its own line */
          ctext = tokenize_TEXT_first_alt(seg.text,(TEXT *)" \t\n");
          while (ctext != NULL){
              fprintf(fp_out,"%s\n",ctext);
              ctext = tokenize_TEXT_first_alt(NULL,(TEXT *)" \t\n");
          }
          (*num_ref) ++;
          free_STM_SEG(&seg);
      }
      fclose(fp);
      fclose(fp_out);
      free_singarr(buf,TEXT);
  }
  
  
  /****************************************************************/
  /*  Fill an stm structure, making sure all records have been    */
  /*  loaded for the file that starts at stm->s.  If the loaded   */
  /*  data ends before a file boundary is seen, additional        */
  /*  records are read in.                                        */
  /*                                                              */
  /*    stm          - structure to fill; passing a null pointer  */
  /*                   only resets the internal end-of-file and   */
  /*                   error state kept between calls             */
  /*    fp_stm       - open STM input stream                      */
  /*    stm_file     - STM file name, used in the error message   */
  /*    stm_file_end - receives the boundary computed by          */
  /*                   locate_STM_boundary (by file, not channel) */
  /*    case_sense   - forwarded to fill_STM                      */
  /*                                                              */
  /*  Exits the program with status 1 if fill_STM reports an      */
  /*  error.                                                      */
  /****************************************************************/
  void fill_STM_structure(STM *stm, FILE *fp_stm, char *stm_file, int *stm_file_end, int case_sense){ 
      static int stm_eof=0, stm_err=0;
      int just_read;

      if (stm == (STM *)0){
          /* Reset the static variables */
          stm_eof = stm_err = 0;
          return;
      }

      just_read = 0;
      do {
          locate_STM_boundary(stm, stm->s, 1, 0, stm_file_end);
          if ((*stm_file_end == stm->num) && !stm_eof){
              /* No file boundary inside the loaded data and more input */
              /* remains.  If the previous pass already read and that   */
              /* was still not enough, grow the array before reading.   */
              if (just_read == 1){
                  just_read = 0;
                  expand_STM(stm);
              }
              fill_STM(fp_stm, stm, stm_file, &stm_eof, case_sense,&stm_err);
              if (stm_err != 0){
                  fprintf(stdout,"; *Err: Error detected in STM file '%s'\n",
                          stm_file);
                  exit(1);
              }
              just_read = 1;
          } else
              just_read = 0;
      } while (just_read);
  }