token.h 6 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179


/*
 * ASCLITE
 * Author: Jerome Ajot, Jon Fiscus, Nicolas Radde, Chris Laprun
 *
 * This software was developed at the National Institute of Standards and Technology by 
 * employees of the Federal Government in the course of their official duties. Pursuant
 * to title 17 Section 105 of the United States Code this software is not subject to
 * copyright protection and is in the public domain. ASCLITE is an experimental system.
 * NIST assumes no responsibility whatsoever for its use by other parties, and makes no
 * guarantees, expressed or implied, about its quality, reliability, or any other
 * characteristic. We would appreciate acknowledgement if the software is used.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS."  With regard to this software, NIST MAKES NO EXPRESS
 * OR IMPLIED WARRANTY AS TO ANY MATTER WHATSOEVER, INCLUDING MERCHANTABILITY,
 * OR FITNESS FOR A PARTICULAR PURPOSE.
 */
 
#ifndef TOKEN_H
#define TOKEN_H

#include <string>
#include <vector>

#include "timedobject.h"
#include "properties.h"
#include "stdarg.h"
#include "logger.h"

class Segment;

/**
 * Internal representation of a token.
 * A token holds all the information needed about a word in order to align it.
 *
 * Each token keeps two lists of pointers to other tokens: the preceding
 * ("prec") tokens and the following ("next") tokens on the graph.
 * The constructor is protected; the static Create* functions below are the
 * public way to obtain a Token.
 */
class Token : public TimedObject
{
	public:
		/** Fragment status: the token is not a fragment. */
		static const int NOT_FRAGMENT = 0;
		/** Fragment status: the token is a beginning fragment (e.g. "frag-"). */
		static const int BEGIN_FRAGMENT = 1;
		/** Fragment status: the token is an ending fragment (e.g. "-ment"). */
		static const int END_FRAGMENT = 2;

		/** Class destructor. */
		~Token();

		/** Sets the raw (source) text of this Token. */
		void SetSourceText(const std::string& x);
		/** Retrieves the raw text for this Token (returned by value). */
		std::string GetSourceText() const { return sourceText; }

		/** Returns the cleaned-up version of the text, used for alignment purposes. */
		std::string GetText();
		/** Returns the text in lower case. FIXME: remove? */
		std::string GetTextInLowerCase();

		/** Sets the value of the confidence score. */
		void SetConfidence(const float& x);
		/** Returns the value of the confidence score. */
		float GetConfidence() const { return confidence; }

		/**
		 * Retrieves the i-th "next" token.
		 * The index is not checked: i must be smaller than GetNbOfNextTokens().
		 */
		Token* GetNextToken(const size_t& i) const { return next[i]; }
		/** Appends a token to the list of "next" tokens. */
		void AddNextToken(Token* token) { next.push_back(token); }
		/**
		 * Retrieves the i-th "prec" (preceding) token.
		 * The index is not checked: i must be smaller than GetNbOfPrecTokens().
		 */
		Token* GetPrecToken(const size_t& i) const { return prec[i]; }
		/** Appends a token to the list of "prec" (preceding) tokens. */
		void AddPrecToken(Token* token) { prec.push_back(token); }

		/*
		 * Factory functions. The time span is given either as start + duration
		 * or as start + end time; the overloads optionally take the token text.
		 * NOTE(review): time units and ownership of the returned pointer are
		 * defined by the implementation file - confirm there.
		 */
		static Token* CreateWithDuration(const int& _startTime, const int& _duration, Segment* parent);
		static Token* CreateWithDuration(const int& _startTime, const int& _duration, Segment* parent, const std::string& _text);
		static Token* CreateWithEndTime(const int& _startTime, const int& _endTime, Segment* parent);
		static Token* CreateWithEndTime(const int& _startTime, const int& _endTime, Segment* parent, const std::string& _text);
		/*
		 * Variadic overload taking a list of tokens after the text.
		 * NOTE(review): presumably these are the preceding tokens to link to;
		 * how the argument list is terminated is decided by the implementation -
		 * confirm before calling.
		 */
		static Token* CreateWithEndTime(const int& _startTime, const int& _endTime, Segment* parent, const std::string& _text, Token* first_prec_tokens, ...);

		/** Links the two tokens together. */
		void LinkTokens(Token *nextToken);

		/** Breaks the link between tokens. */
		void UnlinkTokens(Token *nextToken);

		/** Breaks the next token link between tokens. */
		void UnlinkNextToken(Token *nextToken);
		/** Presumably the counterpart of UnlinkNextToken for the "prec" link - see implementation. */
		void UnlinkPrevToken(Token *prevToken);

		/**
		 * Returns the number of next Tokens.
		 */
		size_t GetNbOfNextTokens() const { return next.size(); }
		/**
		 * Returns the number of prec Tokens.
		 */
		size_t GetNbOfPrecTokens() const { return prec.size(); }
		/**
		 * Returns true if the two tokens are equivalent in a
		 * speech recognition sense.
		 */
		bool IsEquivalentTo(Token* token);
		/**
		 * Returns whether the Token is optionally deletable/insertable.
		 */
		bool IsOptional();
		/**
		 * Returns true if the confidence value was set.
		 */
		bool IsConfidenceSet() const { return hasConfidence; }
		/**
		 * Returns the fragment status of the token:
		 * - Token::NOT_FRAGMENT if the token is not a fragment.
		 * - Token::BEGIN_FRAGMENT if the token is a beginning fragment => frag-
		 * - Token::END_FRAGMENT if the token is an ending fragment => -ment
		 */
		int GetFragmentStatus();

		/**
		 * Returns whether the two tokens are equal.
		 */
		bool Equals(Token* token);

		/** Edit distance between this token and the given one (see implementation for the metric). */
		int EditDistance(Token* token);

		/** Returns a string representation of this Token. */
		std::string ToString();

		/** Retrieves the Segment in which this Token is located. */
		Segment* GetParentSegment() const { return segment; }

		/** Outputs the string information for csv. */
		std::string GetCSVInformation();

		/**
		 * Makes the token optionally deletable by wrapping its source text
		 * between BEGIN_OPTIONAL_MARKER and END_OPTIONAL_MARKER.
		 */
		void BecomeOptionallyDeletable() { SetSourceText(Token::BEGIN_OPTIONAL_MARKER + GetSourceText() + Token::END_OPTIONAL_MARKER); }

	protected:
		/** Class constructor; protected, so outside code goes through the Create* functions. */
		Token();

	private:
		/**
		 * Raw text associated with this Token. We distinguish between source text as
		 * found in input source files, which might contain metadata, and cleaned-up
		 * text, which is the actual text of the Token (excluding metadata).
		 */
		std::string sourceText;
		/** Index position in sourceText of the cleaned-up text. */
		short start;
		/** Size (in characters) of the cleaned-up text. */
		short size;

		/** Updates cleaned-up text if needed. */
		void UpdateCleanedUpTextIfNeeded(const bool& force);

		/**
		 * The confidence score of the token.
		 * The confidence score is a number between 0 and 1
		 * which represents the guessed accuracy of the token text.
		 */
		float confidence;
		/**
		 * True if the confidence was set.
		 */
		bool hasConfidence;
		/**
		 * Stores whether the Token is optionally deletable/insertable.
		 */
		bool optional;
		/**
		 * Stores the fragment status of the Token:
		 * - Token::NOT_FRAGMENT if the token is not a fragment
		 * - Token::BEGIN_FRAGMENT if the token is a beginning fragment => frie-
		 * - Token::END_FRAGMENT if the token is an ending fragment => -ing
		 */
		int fragment;
		/**
		 * Preceding Tokens on the graph.
		 */
		std::vector<Token*> prec;
		/**
		 * Next Tokens on the graph.
		 */
		std::vector<Token*> next;
		/**
		 * Parent Segment.
		 */
		Segment* segment;
		/**
		 * Logger shared by all Tokens.
		 */
		static Logger* logger;

		/* Marker characters; their values are defined in the implementation file. */
		static const char FRAGMENT_MARKER;
		static const char BEGIN_OPTIONAL_MARKER;
		static const char END_OPTIONAL_MARKER;
};

#endif // TOKEN_H