// lm/mikolov-rnnlm-lib.cc

// Copyright 2015  Guoguo Chen
//                 Hainan Xu
//           2010-2012  Tomas Mikolov

// See ../../COPYING for clarification regarding multiple authors
//
// This file is based on version 0.3e of the RNNLM language modeling
// toolkit by Tomas Mikolov.  Changes made by authors other than
// Tomas Mikolov are licensed under the Apache License, the short form
// of which is below.  The original code by Tomas Mikolov is licensed
// under the BSD 3-clause license, whose text is further below.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS
// OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY
// IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
//
//
// Original BSD 3-clause license text:
// Copyright (c) 2010-2012 Tomas Mikolov
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
// 1. Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright notice,
//    this list of conditions and the following disclaimer in the documentation
//    and/or other materials provided with the distribution.
// 3. Neither the name of the copyright holders nor the names of its
//    contributors may be used to endorse or promote products derived from this
//    software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
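// This file implements Mikolov's recurrent neural network language model:
// a class-factorized output layer, an optional compression layer, and hashed
// maximum-entropy "direct" n-gram connections, together with the Kaldi-side
// additions visible below (computeConditionalLogprob() with explicit
// hidden-state context vectors, and a configurable unknown-word penalty).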
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "lm/mikolov-rnnlm-lib.h"
#include "util/table-types.h"

namespace rnnlm {

///// fast exp() implementation
// Approximates exp(y) by writing an integer directly into the high-order bits
// of a double (a Schraudolph-style trick); assumes IEEE-754 doubles stored
// little-endian, so that d2i.n.i aliases the high word of d2i.d.
static union {
  double d;
  struct {
    int j, i;
  } n;
} d2i;
#define EXP_A (1048576 / M_LN2)
#define EXP_C 60801
#define FAST_EXP(y) (d2i.n.i = EXP_A * (y) + (1072693248 - EXP_C), d2i.d)

CRnnLM::CRnnLM() {
  version = 10;
  filetype = TEXT;

  use_lmprob = 0;
  gradient_cutoff = 15;
  dynamic = 0;

  train_file[0] = 0;
  valid_file[0] = 0;
  test_file[0] = 0;
  rnnlm_file[0] = 0;

  alpha_set = 0;
  train_file_set = 0;

  alpha = 0.1;
  beta = 0.0000001;
  // beta = 0.00000;
  alpha_divide = 0;
  logp = 0;
  llogp = -100000000;
  iter = 0;

  min_improvement = 1.003;

  train_words = 0;

  vocab_max_size = 100;
  vocab_size = 0;
  vocab = (struct vocab_word *)calloc(vocab_max_size,
                                      sizeof(struct vocab_word));

  layer1_size = 30;

  direct_size = 0;
  direct_order = 0;

  bptt = 0;
  bptt_block = 10;
  bptt_history = NULL;
  bptt_hidden = NULL;
  bptt_syn0 = NULL;

  gen = 0;

  independent = 0;

  neu0 = NULL;
  neu1 = NULL;
  neuc = NULL;
  neu2 = NULL;

  syn0 = NULL;
  syn1 = NULL;
  sync = NULL;
  syn_d = NULL;
  syn_db = NULL;

  // backup
  neu0b = NULL;
  neu1b = NULL;
  neucb = NULL;
  neu2b = NULL;

  neu1b2 = NULL;

  syn0b = NULL;
  syn1b = NULL;
  syncb = NULL;

  rand_seed = 1;

  class_size = 100;
  old_classes = 0;

  srand(rand_seed);

  vocab_hash_size = 100000000;
  vocab_hash = reinterpret_cast<int *>(calloc(vocab_hash_size, sizeof(int)));
}

CRnnLM::~CRnnLM() {
  int i;

  if (neu0 != NULL) {
    free(neu0);
    free(neu1);
    if (neuc != NULL) free(neuc);
    free(neu2);

    free(syn0);
    free(syn1);
    if (sync != NULL) free(sync);

    if (syn_d != NULL) free(syn_d);
    if (syn_db != NULL) free(syn_db);

    free(neu0b);
    free(neu1b);
    if (neucb != NULL) free(neucb);
    free(neu2b);

    free(neu1b2);

    free(syn0b);
    free(syn1b);
    if (syncb != NULL) free(syncb);

    for (i = 0; i < class_size; i++) {
      free(class_words[i]);
    }
    free(class_max_cn);
    free(class_cn);
    free(class_words);

    free(vocab);
    free(vocab_hash);

    if (bptt_history != NULL) free(bptt_history);
    if (bptt_hidden != NULL) free(bptt_hidden);
    if (bptt_syn0 != NULL) free(bptt_syn0);

    // todo: free bptt variables too
  }
}

real CRnnLM::random(real min, real max) {
  return rand() / (real)RAND_MAX * (max - min) + min;
}

void CRnnLM::setRnnLMFile(const std::string &str) {
  strcpy(rnnlm_file, str.c_str());
}

void CRnnLM::setRandSeed(int newSeed) {
  rand_seed = newSeed;
  srand(rand_seed);
}

void CRnnLM::readWord(char *word, FILE *fin) {
  int a = 0, ch;

  while (!feof(fin)) {
    ch = fgetc(fin);

    if (ch == 13) continue;

    if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
      if (a > 0) {
        if (ch == '\n') ungetc(ch, fin);
        break;
      }

      if (ch == '\n') {
        strcpy(word, const_cast<char *>(""));
        return;
      } else {
        continue;
      }
    }

    word[a] = ch;
    a++;

    if (a >= MAX_STRING) {
      // printf("Too long word found!\n");   // truncate too long words
      a--;
    }
  }
  word[a] = 0;
}

int CRnnLM::getWordHash(const char *word) {
  unsigned int hash, a;

  hash = 0;
  for (a = 0; a < strlen(word); a++) {
    hash = hash * 237 + word[a];
  }
  hash = hash % vocab_hash_size;

  return hash;
}

int CRnnLM::searchVocab(const char *word) {
  int a;
  unsigned int hash;

  hash = getWordHash(word);

  if (vocab_hash[hash] == -1) return -1;
  if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];

  for (a = 0; a < vocab_size; a++) {  // search in vocabulary
    if (!strcmp(word, vocab[a].word)) {
      vocab_hash[hash] = a;
      return a;
    }
  }

  return -1;  // return OOV if not found
}

void CRnnLM::sortVocab() {
  int a, b, max;
  vocab_word swap;

  for (a = 1; a < vocab_size; a++) {
    max = a;
    for (b = a + 1; b < vocab_size; b++) {
      if (vocab[max].cn < vocab[b].cn) max = b;
    }
    swap = vocab[max];
    vocab[max] = vocab[a];
    vocab[a] = swap;
  }
}

void CRnnLM::saveWeights() {  // saves current weights and unit activations
  int a, b;

  for (a = 0; a < layer0_size; a++) {
    neu0b[a].ac = neu0[a].ac;
    neu0b[a].er = neu0[a].er;
  }
  for (a = 0; a < layer1_size; a++) {
    neu1b[a].ac = neu1[a].ac;
    neu1b[a].er = neu1[a].er;
  }
  for (a = 0; a < layerc_size; a++) {
    neucb[a].ac = neuc[a].ac;
    neucb[a].er = neuc[a].er;
  }
  for (a = 0; a < layer2_size; a++) {
    neu2b[a].ac = neu2[a].ac;
    neu2b[a].er = neu2[a].er;
  }

  for (b = 0; b < layer1_size; b++) {
    for (a = 0; a < layer0_size; a++) {
      syn0b[a + b * layer0_size].weight = syn0[a + b * layer0_size].weight;
    }
  }

  if (layerc_size > 0) {
    for (b = 0; b < layerc_size; b++) {
      for (a = 0; a < layer1_size; a++) {
        syn1b[a + b * layer1_size].weight = syn1[a + b * layer1_size].weight;
      }
    }
    for (b = 0; b < layer2_size; b++) {
      for (a = 0; a < layerc_size; a++) {
        syncb[a + b * layerc_size].weight = sync[a + b * layerc_size].weight;
      }
    }
  } else {
    for (b = 0; b < layer2_size; b++) {
      for (a = 0; a < layer1_size; a++) {
        syn1b[a + b * layer1_size].weight = syn1[a + b * layer1_size].weight;
      }
    }
  }

  // for (a = 0; a < direct_size; a++) syn_db[a].weight = syn_d[a].weight;
}

void CRnnLM::initNet() {
  int a, b, cl;

  layer0_size = vocab_size + layer1_size;
  layer2_size = vocab_size + class_size;

  neu0 = (struct neuron *)calloc(layer0_size, sizeof(struct neuron));
  neu1 = (struct neuron *)calloc(layer1_size, sizeof(struct neuron));
  neuc = (struct neuron *)calloc(layerc_size, sizeof(struct neuron));
  neu2 = (struct neuron *)calloc(layer2_size, sizeof(struct neuron));

  syn0 = (struct synapse *)calloc(layer0_size * layer1_size,
                                  sizeof(struct synapse));
  if (layerc_size == 0) {
    syn1 = (struct synapse *)calloc(layer1_size * layer2_size,
                                    sizeof(struct synapse));
  } else {
    syn1 = (struct synapse *)calloc(layer1_size * layerc_size,
                                    sizeof(struct synapse));
    sync = (struct synapse *)calloc(layerc_size * layer2_size,
                                    sizeof(struct synapse));
  }

  if (syn1 == NULL) {
    printf("Memory allocation failed\n");
    exit(1);
  }

  if (layerc_size > 0) if (sync == NULL) {
    printf("Memory allocation failed\n");
    exit(1);
  }

  syn_d = reinterpret_cast<direct_t *>(
      calloc(static_cast<size_t>(direct_size), sizeof(direct_t)));

  if (syn_d == NULL) {
    printf("Memory allocation for direct"
           " connections failed (requested %lld bytes)\n",
           static_cast<long long>(direct_size)
           * static_cast<long long>(sizeof(direct_t)));
    exit(1);
  }

  neu0b = (struct neuron *)calloc(layer0_size, sizeof(struct neuron));
  neu1b = (struct neuron *)calloc(layer1_size, sizeof(struct neuron));
  neucb = (struct neuron *)calloc(layerc_size, sizeof(struct neuron));
  neu1b2 = (struct neuron *)calloc(layer1_size, sizeof(struct neuron));
  neu2b = (struct neuron *)calloc(layer2_size, sizeof(struct neuron));

  syn0b = (struct synapse *)calloc(layer0_size * layer1_size,
                                   sizeof(struct synapse));
  // syn1b = (struct synapse *)calloc(layer1_size*layer2_size,
  //                                  sizeof(struct synapse));
  if (layerc_size == 0) {
    syn1b = (struct synapse *)calloc(layer1_size * layer2_size,
                                     sizeof(struct synapse));
  } else {
    syn1b = (struct synapse *)calloc(layer1_size * layerc_size,
                                     sizeof(struct synapse));
    syncb = (struct synapse *)calloc(layerc_size * layer2_size,
                                     sizeof(struct synapse));
  }

  if (syn1b == NULL) {
    printf("Memory allocation failed\n");
    exit(1);
  }

  for (a = 0; a < layer0_size; a++) {
    neu0[a].ac = 0;
    neu0[a].er = 0;
  }
  for (a = 0; a < layer1_size; a++) {
    neu1[a].ac = 0;
    neu1[a].er = 0;
  }
  for (a = 0; a < layerc_size; a++) {
    neuc[a].ac = 0;
    neuc[a].er = 0;
  }
  for (a = 0; a < layer2_size; a++) {
    neu2[a].ac = 0;
    neu2[a].er = 0;
  }
  for (b = 0; b < layer1_size; b++) {
    for (a = 0; a < layer0_size; a++) {
      syn0[a + b * layer0_size].weight =
          random(-0.1, 0.1) + random(-0.1, 0.1) + random(-0.1, 0.1);
    }
  }

  if (layerc_size > 0) {
    for (b = 0; b < layerc_size; b++) {
      for (a = 0; a < layer1_size; a++) {
        syn1[a + b * layer1_size].weight =
            random(-0.1, 0.1) + random(-0.1, 0.1) + random(-0.1, 0.1);
      }
    }
    for (b = 0; b < layer2_size; b++) {
      for (a = 0; a < layerc_size; a++) {
        sync[a + b * layerc_size].weight =
            random(-0.1, 0.1) + random(-0.1, 0.1) + random(-0.1, 0.1);
      }
    }
  } else {
    for (b = 0; b < layer2_size; b++) {
      for (a = 0; a < layer1_size; a++) {
        syn1[a + b * layer1_size].weight =
            random(-0.1, 0.1) + random(-0.1, 0.1) + random(-0.1, 0.1);
      }
    }
  }

  long long aa;
  for (aa = 0; aa < direct_size; aa++) {
    syn_d[aa] = 0;
  }

  if (bptt > 0) {
    bptt_history = reinterpret_cast<int *>(
        calloc((bptt + bptt_block + 10), sizeof(int)));
    for (a = 0; a < bptt + bptt_block; a++) {
      bptt_history[a] = -1;
    }

    bptt_hidden = reinterpret_cast<neuron *>(
        calloc((bptt + bptt_block + 1) * layer1_size, sizeof(neuron)));
    for (a = 0; a < (bptt + bptt_block) * layer1_size; a++) {
      bptt_hidden[a].ac = 0;
      bptt_hidden[a].er = 0;
    }

    bptt_syn0 = (struct synapse *)calloc(layer0_size * layer1_size,
                                         sizeof(struct synapse));
    if (bptt_syn0 == NULL) {
      printf("Memory allocation failed\n");
      exit(1);
    }
  }

  saveWeights();

  double df, dd;
  int i;

  df = 0;
  dd = 0;
  a = 0;
  b = 0;

  // assign words to frequency-based classes: "old" classes split the raw
  // cumulative unigram distribution evenly, "new" classes split its square
  // root, so that each class carries roughly equal probability mass
  if (old_classes) {  // old classes
    for (i = 0; i < vocab_size; i++) {
      b += vocab[i].cn;
    }
    for (i = 0; i < vocab_size; i++) {
      df += vocab[i].cn / static_cast<double>(b);
      if (df > 1) df = 1;
      if (df > (a + 1) / static_cast<double>(class_size)) {
        vocab[i].class_index = a;
        if (a < class_size - 1) a++;
      } else {
        vocab[i].class_index = a;
      }
    }
  } else {  // new classes
    for (i = 0; i < vocab_size; i++) {
      b += vocab[i].cn;
    }
    for (i = 0; i < vocab_size; i++) {
      dd += sqrt(vocab[i].cn / static_cast<double>(b));
    }
    for (i = 0; i < vocab_size; i++) {
      df += sqrt(vocab[i].cn / static_cast<double>(b)) / dd;
      if (df > 1) df = 1;
      if (df > (a + 1) / static_cast<double>(class_size)) {
        vocab[i].class_index = a;
        if (a < class_size - 1) a++;
      } else {
        vocab[i].class_index = a;
      }
    }
  }

  // allocate auxiliary class variables (for faster search when
  // normalizing probability at output layer)
  class_words = reinterpret_cast<int **>(calloc(class_size, sizeof(int *)));
  class_cn = reinterpret_cast<int *>(calloc(class_size, sizeof(int)));
  class_max_cn = reinterpret_cast<int *>(calloc(class_size, sizeof(int)));

  for (i = 0; i < class_size; i++) {
    class_cn[i] = 0;
    class_max_cn[i] = 10;
    class_words[i] = reinterpret_cast<int *>(
        calloc(class_max_cn[i], sizeof(int)));
  }

  for (i = 0; i < vocab_size; i++) {
    cl = vocab[i].class_index;
    class_words[cl][class_cn[cl]] = i;
    class_cn[cl]++;
    if (class_cn[cl] + 2 >= class_max_cn[cl]) {
      class_max_cn[cl] += 10;
      class_words[cl] = reinterpret_cast<int *>(
          realloc(class_words[cl], class_max_cn[cl] * sizeof(int)));
    }
  }
}

void CRnnLM::goToDelimiter(int delim, FILE *fi) {
  int ch = 0;

  while (ch != delim) {
    ch = fgetc(fi);
    if (feof(fi)) {
      printf("Unexpected end of file\n");
      exit(1);
    }
  }
}

void CRnnLM::restoreNet() {  // will read whole network structure
  FILE *fi;
  int a, b, ver, unused_size;
  float fl;
  char str[MAX_STRING];
  double d;

  fi = fopen(rnnlm_file, "rb");
  if (fi == NULL) {
    printf("ERROR: model file '%s' not found!\n", rnnlm_file);
    exit(1);
  }

  goToDelimiter(':', fi);
  unused_size = fscanf(fi, "%d", &ver);
  if ((ver == 4) && (version == 5)) {
    /* we will solve this later.. */
  } else {
    if (ver != version) {
      printf("Unknown version of file %s\n", rnnlm_file);
      exit(1);
    }
  }

  goToDelimiter(':', fi);
  unused_size = fscanf(fi, "%d", &filetype);

  goToDelimiter(':', fi);
  if (train_file_set == 0) {
    unused_size = fscanf(fi, "%s", train_file);
  } else {
    unused_size = fscanf(fi, "%s", str);
  }

  goToDelimiter(':', fi);
  unused_size = fscanf(fi, "%s", valid_file);

  goToDelimiter(':', fi);
  unused_size = fscanf(fi, "%lf", &llogp);

  goToDelimiter(':', fi);
  unused_size = fscanf(fi, "%d", &iter);

  goToDelimiter(':', fi);
  unused_size = fscanf(fi, "%d", &train_cur_pos);

  goToDelimiter(':', fi);
  unused_size = fscanf(fi, "%lf", &logp);

  goToDelimiter(':', fi);
  unused_size = fscanf(fi, "%d", &anti_k);

  goToDelimiter(':', fi);
  unused_size = fscanf(fi, "%d", &train_words);

  goToDelimiter(':', fi);
  unused_size = fscanf(fi, "%d", &layer0_size);

  goToDelimiter(':', fi);
  unused_size = fscanf(fi, "%d", &layer1_size);

  goToDelimiter(':', fi);
  unused_size = fscanf(fi, "%d", &layerc_size);

  goToDelimiter(':', fi);
  unused_size = fscanf(fi, "%d", &layer2_size);

  if (ver > 5) {
    goToDelimiter(':', fi);
    unused_size = fscanf(fi, "%lld", &direct_size);
  }

  if (ver > 6) {
    goToDelimiter(':', fi);
    unused_size = fscanf(fi, "%d", &direct_order);
  }

  goToDelimiter(':', fi);
  unused_size = fscanf(fi, "%d", &bptt);

  if (ver > 4) {
    goToDelimiter(':', fi);
    unused_size = fscanf(fi, "%d", &bptt_block);
  } else {
    bptt_block = 10;
  }

  goToDelimiter(':', fi);
  unused_size = fscanf(fi, "%d", &vocab_size);

  goToDelimiter(':', fi);
  unused_size = fscanf(fi, "%d", &class_size);

  goToDelimiter(':', fi);
  unused_size = fscanf(fi, "%d", &old_classes);

  goToDelimiter(':', fi);
  unused_size = fscanf(fi, "%d", &independent);

  goToDelimiter(':', fi);
  unused_size = fscanf(fi, "%lf", &d);
  starting_alpha = d;

  goToDelimiter(':', fi);
  if (alpha_set == 0) {
    unused_size = fscanf(fi, "%lf", &d);
    alpha = d;
  } else {
    unused_size = fscanf(fi, "%lf", &d);
  }

  goToDelimiter(':', fi);
  unused_size = fscanf(fi, "%d", &alpha_divide);

  // read normal vocabulary
  if (vocab_max_size < vocab_size) {
    if (vocab != NULL) free(vocab);
    vocab_max_size = vocab_size + 1000;
    // initialize memory for vocabulary
    vocab = (struct vocab_word *)calloc(vocab_max_size,
                                        sizeof(struct vocab_word));
  }

  goToDelimiter(':', fi);
  for (a = 0; a < vocab_size; a++) {
    // unused_size = fscanf(fi, "%d%d%s%d", &b, &vocab[a].cn,
    //                      vocab[a].word, &vocab[a].class_index);
    unused_size = fscanf(fi, "%d%d", &b, &vocab[a].cn);
    readWord(vocab[a].word, fi);
    unused_size = fscanf(fi, "%d", &vocab[a].class_index);
    // printf("%d %d %s %d\n", b, vocab[a].cn,
    //        vocab[a].word, vocab[a].class_index);
  }

  if (neu0 == NULL) initNet();  // memory allocation here

  if (filetype == TEXT) {
    goToDelimiter(':', fi);
    for (a = 0; a < layer1_size; a++) {
      unused_size = fscanf(fi, "%lf", &d);
      neu1[a].ac = d;
    }
  }
  if (filetype == BINARY) {
    fgetc(fi);
    for (a = 0; a < layer1_size; a++) {
      unused_size = fread(&fl, 4, 1, fi);
      neu1[a].ac = fl;
    }
  }

  if (filetype == TEXT) {
    goToDelimiter(':', fi);
    for (b = 0; b < layer1_size; b++) {
      for (a = 0; a < layer0_size; a++) {
        unused_size = fscanf(fi, "%lf", &d);
        syn0[a + b * layer0_size].weight = d;
      }
    }
  }
  if (filetype == BINARY) {
    for (b = 0; b < layer1_size; b++) {
      for (a = 0; a < layer0_size; a++) {
        unused_size = fread(&fl, 4, 1, fi);
        syn0[a + b * layer0_size].weight = fl;
      }
    }
  }

  if (filetype == TEXT) {
    goToDelimiter(':', fi);
    if (layerc_size == 0) {  // no compress layer
      for (b = 0; b < layer2_size; b++) {
        for (a = 0; a < layer1_size; a++) {
          unused_size = fscanf(fi, "%lf", &d);
          syn1[a + b * layer1_size].weight = d;
        }
      }
    } else {  // with compress layer
      for (b = 0; b < layerc_size; b++) {
        for (a = 0; a < layer1_size; a++) {
          unused_size = fscanf(fi, "%lf", &d);
          syn1[a + b * layer1_size].weight = d;
        }
      }

      goToDelimiter(':', fi);

      for (b = 0; b < layer2_size; b++) {
        for (a = 0; a < layerc_size; a++) {
          unused_size = fscanf(fi, "%lf", &d);
          sync[a + b * layerc_size].weight = d;
        }
      }
    }
  }
  if (filetype == BINARY) {
    if (layerc_size == 0) {  // no compress layer
      for (b = 0; b < layer2_size; b++) {
        for (a = 0; a < layer1_size; a++) {
          unused_size = fread(&fl, 4, 1, fi);
          syn1[a + b * layer1_size].weight = fl;
        }
      }
    } else {  // with compress layer
      for (b = 0; b < layerc_size; b++) {
        for (a = 0; a < layer1_size; a++) {
          unused_size = fread(&fl, 4, 1, fi);
          syn1[a + b * layer1_size].weight = fl;
        }
      }

      for (b = 0; b < layer2_size; b++) {
        for (a = 0; a < layerc_size; a++) {
          unused_size = fread(&fl, 4, 1, fi);
          sync[a + b * layerc_size].weight = fl;
        }
      }
    }
  }

  if (filetype == TEXT) {
    goToDelimiter(':', fi);  // direct connections
    long long aa;
    for (aa = 0; aa < direct_size; aa++) {
      unused_size = fscanf(fi, "%lf", &d);
      syn_d[aa] = d;
    }
  }
  if (filetype == BINARY) {
    long long aa;
    for (aa = 0; aa < direct_size; aa++) {
      unused_size = fread(&fl, 4, 1, fi);
      syn_d[aa] = fl;
      /*unused_size = fread(&si, 2, 1, fi);
      fl = si / (float)(4 * 256);
      syn_d[aa] = fl;*/
    }
  }

  saveWeights();

  // idiom to "use" an unused variable
  (void) unused_size;

  fclose(fi);
}

void CRnnLM::netReset() {  // cleans hidden layer activation + bptt history
  int a, b;

  for (a = 0; a < layer1_size; a++) {
    neu1[a].ac = 1.0;
  }

  copyHiddenLayerToInput();

  if (bptt > 0) {
    for (a = 1; a < bptt + bptt_block; a++) {
      bptt_history[a] = 0;
    }
    for (a = bptt + bptt_block - 1; a > 1; a--) {
      for (b = 0; b < layer1_size; b++) {
        bptt_hidden[a * layer1_size + b].ac = 0;
        bptt_hidden[a * layer1_size + b].er = 0;
      }
    }
  }

  for (a = 0; a < MAX_NGRAM_ORDER; a++) {
    history[a] = 0;
  }
}

void CRnnLM::matrixXvector(struct neuron *dest, struct neuron *srcvec,
                           struct synapse *srcmatrix, int matrix_width,
                           int from, int to, int from2, int to2, int type) {
  int a, b;
  real val1, val2, val3, val4;
  real val5, val6, val7, val8;

  if (type == 0) {  // ac mod
    for (b = 0; b < (to - from) / 8; b++) {
      val1 = 0; val2 = 0; val3 = 0; val4 = 0;
      val5 = 0; val6 = 0; val7 = 0; val8 = 0;

      for (a = from2; a < to2; a++) {
        val1 += srcvec[a].ac * srcmatrix[a + (b * 8 + from + 0) * matrix_width].weight;
        val2 += srcvec[a].ac * srcmatrix[a + (b * 8 + from + 1) * matrix_width].weight;
        val3 += srcvec[a].ac * srcmatrix[a + (b * 8 + from + 2) * matrix_width].weight;
        val4 += srcvec[a].ac * srcmatrix[a + (b * 8 + from + 3) * matrix_width].weight;
        val5 += srcvec[a].ac * srcmatrix[a + (b * 8 + from + 4) * matrix_width].weight;
        val6 += srcvec[a].ac * srcmatrix[a + (b * 8 + from + 5) * matrix_width].weight;
        val7 += srcvec[a].ac * srcmatrix[a + (b * 8 + from + 6) * matrix_width].weight;
        val8 += srcvec[a].ac * srcmatrix[a + (b * 8 + from + 7) * matrix_width].weight;
      }

      dest[b * 8 + from + 0].ac += val1;
      dest[b * 8 + from + 1].ac += val2;
      dest[b * 8 + from + 2].ac += val3;
      dest[b * 8 + from + 3].ac += val4;
      dest[b * 8 + from + 4].ac += val5;
      dest[b * 8 + from + 5].ac += val6;
      dest[b * 8 + from + 6].ac += val7;
      dest[b * 8 + from + 7].ac += val8;
    }

    for (b = b * 8; b < to - from; b++) {
      for (a = from2; a < to2; a++) {
        dest[b + from].ac +=
            srcvec[a].ac * srcmatrix[a + (b + from) * matrix_width].weight;
      }
    }
  } else {  // er mod
    for (a = 0; a < (to2 - from2) / 8; a++) {
      val1 = 0; val2 = 0; val3 = 0; val4 = 0;
      val5 = 0; val6 = 0; val7 = 0; val8 = 0;
      for (b = from; b < to; b++) {
        val1 += srcvec[b].er * srcmatrix[a * 8 + from2 + 0 + b * matrix_width].weight;
        val2 += srcvec[b].er * srcmatrix[a * 8 + from2 + 1 + b * matrix_width].weight;
        val3 += srcvec[b].er * srcmatrix[a * 8 + from2 + 2 + b * matrix_width].weight;
        val4 += srcvec[b].er * srcmatrix[a * 8 + from2 + 3 + b * matrix_width].weight;
        val5 += srcvec[b].er * srcmatrix[a * 8 + from2 + 4 + b * matrix_width].weight;
        val6 += srcvec[b].er * srcmatrix[a * 8 + from2 + 5 + b * matrix_width].weight;
        val7 += srcvec[b].er * srcmatrix[a * 8 + from2 + 6 + b * matrix_width].weight;
        val8 += srcvec[b].er * srcmatrix[a * 8 + from2 + 7 + b * matrix_width].weight;
      }

      dest[a * 8 + from2 + 0].er += val1;
      dest[a * 8 + from2 + 1].er += val2;
      dest[a * 8 + from2 + 2].er += val3;
      dest[a * 8 + from2 + 3].er += val4;
      dest[a * 8 + from2 + 4].er += val5;
      dest[a * 8 + from2 + 5].er += val6;
      dest[a * 8 + from2 + 6].er += val7;
      dest[a * 8 + from2 + 7].er += val8;
    }

    for (a = a * 8; a < to2 - from2; a++) {
      for (b = from; b < to; b++) {
        dest[a + from2].er +=
            srcvec[b].er * srcmatrix[a + from2 + b * matrix_width].weight;
      }
    }

    if (gradient_cutoff > 0)
      for (a = from2; a < to2; a++) {
        if (dest[a].er > gradient_cutoff) dest[a].er = gradient_cutoff;
        if (dest[a].er < -gradient_cutoff) dest[a].er = -gradient_cutoff;
      }
  }

  // this is normal implementation (about 3x slower):
  /*if (type == 0) {  // ac mod
    for (b = from; b < to; b++) {
      for (a = from2; a < to2; a++) {
        dest[b].ac += srcvec[a].ac * srcmatrix[a + b * matrix_width].weight;
      }
    }
  } else  // er mod
  if (type == 1) {
    for (a = from2; a < to2; a++) {
      for (b = from; b < to; b++) {
        dest[a].er += srcvec[b].er * srcmatrix[a + b * matrix_width].weight;
      }
    }
  }*/
}

void CRnnLM::computeNet(int last_word, int word) {
  int a, b, c;
  real val;
  double sum;  // sum is used for normalization: it's better to have larger
               // precision as many numbers are summed together here

  if (last_word != -1) neu0[last_word].ac = 1;

  // propagate 0->1
  for (a = 0; a < layer1_size; a++) {
    neu1[a].ac = 0;
  }
  for (a = 0; a < layerc_size; a++) {
    neuc[a].ac = 0;
  }

  matrixXvector(neu1, neu0, syn0, layer0_size, 0, layer1_size,
                layer0_size - layer1_size, layer0_size, 0);

  for (b = 0; b < layer1_size; b++) {
    a = last_word;
    if (a != -1) neu1[b].ac += neu0[a].ac * syn0[a + b * layer0_size].weight;
  }

  // activate 1 --sigmoid
  for (a = 0; a < layer1_size; a++) {
    if (neu1[a].ac > 50) neu1[a].ac = 50;    // for numerical stability
    if (neu1[a].ac < -50) neu1[a].ac = -50;  // for numerical stability
    val = -neu1[a].ac;
    neu1[a].ac = 1 / (1 + FAST_EXP(val));
  }

  if (layerc_size > 0) {
    matrixXvector(neuc, neu1, syn1, layer1_size, 0, layerc_size,
                  0, layer1_size, 0);
    // activate compression --sigmoid
    for (a = 0; a < layerc_size; a++) {
      if (neuc[a].ac > 50) neuc[a].ac = 50;    // for numerical stability
      if (neuc[a].ac < -50) neuc[a].ac = -50;  // for numerical stability
      val = -neuc[a].ac;
      neuc[a].ac = 1 / (1 + FAST_EXP(val));
    }
  }

  // 1->2 class
  for (b = vocab_size; b < layer2_size; b++) {
    neu2[b].ac = 0;
  }

  if (layerc_size > 0) {
    matrixXvector(neu2, neuc, sync, layerc_size, vocab_size, layer2_size,
                  0, layerc_size, 0);
  } else {
    matrixXvector(neu2, neu1, syn1, layer1_size, vocab_size, layer2_size,
                  0, layer1_size, 0);
  }

  // apply direct connections to classes
  if (direct_size > 0) {
    unsigned long long hash[MAX_NGRAM_ORDER];
    // this will hold pointers to syn_d that contains hash parameters

    for (a = 0; a < direct_order; a++) {
      hash[a] = 0;
    }

    for (a = 0; a < direct_order; a++) {
      b = 0;
      if (a > 0) if (history[a - 1] == -1) break;
      // if OOV was in history, do not use this N-gram feature and higher orders
      hash[a] = PRIMES[0] * PRIMES[1];

      for (b = 1; b <= a; b++) {
        hash[a] += PRIMES[(a * PRIMES[b] + b) % PRIMES_SIZE]
                   * static_cast<unsigned long long>(history[b - 1] + 1);
      }
      // update hash value based on words from the history

      hash[a] = hash[a] % (direct_size / 2);
      // make sure that starting hash index is in the first half of syn_d
      // (second part is reserved for history->words features)
    }

    for (a = vocab_size; a < layer2_size; a++) {
      for (b = 0; b < direct_order; b++) {
        if (hash[b]) {
          neu2[a].ac += syn_d[hash[b]];
          // apply current parameter and move to the next one
          hash[b]++;
        } else {
          break;
        }
      }
    }
  }

  // activation 2 --softmax on classes
  sum = 0;
  for (a = vocab_size; a < layer2_size; a++) {
    if (neu2[a].ac > 50) neu2[a].ac = 50;    // for numerical stability
    if (neu2[a].ac < -50) neu2[a].ac = -50;  // for numerical stability
    val = FAST_EXP(neu2[a].ac);
    sum += val;
    neu2[a].ac = val;
  }
  for (a = vocab_size; a < layer2_size; a++) {
    neu2[a].ac /= sum;
  }
  // output layer activations now sum exactly to 1

  if (gen > 0) return;
  // if we generate words, we don't know what current word is -> only classes
  // are estimated and word is selected in testGen()

  // 1->2 word
  if (word != -1) {
    for (c = 0; c < class_cn[vocab[word].class_index]; c++) {
      neu2[class_words[vocab[word].class_index][c]].ac = 0;
    }
    if (layerc_size > 0) {
      matrixXvector(neu2, neuc, sync, layerc_size,
                    class_words[vocab[word].class_index][0],
                    class_words[vocab[word].class_index][0]
                    + class_cn[vocab[word].class_index],
                    0, layerc_size, 0);
    } else {
      matrixXvector(neu2, neu1, syn1, layer1_size,
                    class_words[vocab[word].class_index][0],
                    class_words[vocab[word].class_index][0]
                    + class_cn[vocab[word].class_index],
                    0, layer1_size, 0);
    }
  }

  // apply direct connections to words
  if (word != -1) if (direct_size > 0) {
    unsigned long long hash[MAX_NGRAM_ORDER];

    for (a = 0; a < direct_order; a++) {
      hash[a] = 0;
    }

    for (a = 0; a < direct_order; a++) {
      b = 0;
      if (a > 0) if (history[a - 1] == -1) break;
      hash[a] = PRIMES[0] * PRIMES[1]
                * static_cast<unsigned long long>(vocab[word].class_index + 1);

      for (b = 1; b <= a; b++) {
        hash[a] += PRIMES[(a * PRIMES[b] + b) % PRIMES_SIZE]
                   * static_cast<unsigned long long>(history[b - 1] + 1);
      }
      hash[a] = (hash[a] % (direct_size / 2)) + (direct_size) / 2;
    }

    for (c = 0; c < class_cn[vocab[word].class_index]; c++) {
      a = class_words[vocab[word].class_index][c];

      for (b = 0; b < direct_order; b++) if (hash[b]) {
        neu2[a].ac += syn_d[hash[b]];
        hash[b]++;
        hash[b] = hash[b] % direct_size;
      } else {
        break;
      }
    }
  }

  // activation 2 --softmax on words
  sum = 0;
  if (word != -1) {
    for (c = 0; c < class_cn[vocab[word].class_index]; c++) {
      a = class_words[vocab[word].class_index][c];
      if (neu2[a].ac > 50) neu2[a].ac = 50;    // for numerical stability
      if (neu2[a].ac < -50) neu2[a].ac = -50;  // for numerical stability
      val = FAST_EXP(neu2[a].ac);
      sum += val;
      neu2[a].ac = val;
    }
    for (c = 0; c < class_cn[vocab[word].class_index]; c++) {
      neu2[class_words[vocab[word].class_index][c]].ac /= sum;
    }
  }
}

void CRnnLM::copyHiddenLayerToInput() {
  int a;

  for (a = 0; a < layer1_size; a++) {
    neu0[a + layer0_size - layer1_size].ac = neu1[a].ac;
  }
}

void CRnnLM::restoreContextFromVector(const std::vector<float> &context_in) {
  assert(context_in.size() == layer1_size);
  for (int i = 0; i < layer1_size; ++i) {
    neu1[i].ac = context_in[i];
  }
}

void CRnnLM::saveContextToVector(std::vector<float> *context_out) {
  assert(context_out != NULL);
  context_out->resize(layer1_size);
  for (int i = 0; i < layer1_size; ++i) {
    (*context_out)[i] = neu1[i].ac;
  }
}
float CRnnLM::computeConditionalLogprob(
    std::string current_word,
    const std::vector<std::string> &history_words,
    const std::vector<float> &context_in,
    std::vector<float> *context_out) {
  // We assume the network has been restored.
  netReset();
  restoreContextFromVector(context_in);
  copyHiddenLayerToInput();

  // Maps unk to the unk symbol.
  std::vector<std::string> history_words_nounk(history_words);
  std::string current_word_nounk = current_word;
  if (isUnk(current_word_nounk)) {
    current_word_nounk = unk_sym;
  }
  for (int i = 0; i < history_words_nounk.size(); ++i) {
    if (isUnk(history_words_nounk[i])) {
      history_words_nounk[i] = unk_sym;
    }
  }

  // Handles history for n-gram features.
  for (int i = 0; i < MAX_NGRAM_ORDER; i++) {
    history[i] = 0;
  }
  for (int i = 0; i < history_words_nounk.size() && i < MAX_NGRAM_ORDER; i++) {
    history[i] = searchVocab(
        history_words_nounk[history_words_nounk.size() - 1 - i].c_str());
  }

  int word = 0, last_word = 0;
  float logprob = 0;
  if (current_word_nounk == unk_sym) {
    logprob += getUnkPenalty(current_word);
  }
  word = searchVocab(current_word_nounk.c_str());
  if (history_words_nounk.size() > 0) {
    last_word = searchVocab(
        history_words_nounk[history_words_nounk.size() - 1].c_str());
  }

  computeNet(last_word, word);
  if (word != -1) {
    logprob += log(neu2[vocab[word].class_index + vocab_size].ac
                   * neu2[word].ac);
  } else {
    logprob += -16.118;
  }

  if (context_out != NULL) {
    saveContextToVector(context_out);
  }

  if (last_word != -1) {
    neu0[last_word].ac = 0;
  }

  return logprob;
}

bool CRnnLM::isUnk(const std::string &word) {
  int word_int = searchVocab(word.c_str());
  if (word_int == -1) return true;
  return false;
}

void CRnnLM::setUnkSym(const std::string &unk) {
  unk_sym = unk;
}

float CRnnLM::getUnkPenalty(const std::string &word) {
  unordered_map<std::string, float>::const_iterator iter =
      unk_penalty.find(word);
  if (iter != unk_penalty.end()) return iter->second;
  return -16.118;  // Fixed penalty.
}

void CRnnLM::setUnkPenalty(const std::string &filename) {
  if (filename.empty()) return;
  kaldi::SequentialBaseFloatReader unk_reader(filename);
  for (; !unk_reader.Done(); unk_reader.Next()) {
    std::string key = unk_reader.Key();
    float prob = unk_reader.Value();
    unk_reader.FreeCurrent();
    unk_penalty[key] = log(prob);
  }
}

}  // namespace rnnlm
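// Example usage (a minimal, hypothetical sketch kept as a comment; it is not
// part of the library).  It strings together the methods defined above --
// setRnnLMFile(), setUnkSym(), restoreNet() and computeConditionalLogprob() --
// to query the log-probability of one word given its history.  The model
// path, the unknown-word symbol, the words, and hidden_size (which must match
// the loaded model's layer1_size) are the caller's assumptions, not values
// defined in this file.
//
//   rnnlm::CRnnLM model;
//   model.setRnnLMFile("rnnlm.model");   // hypothetical model path
//   model.setUnkSym("<unk>");            // symbol substituted for OOV words
//   model.restoreNet();                  // reads the whole network structure
//
//   int hidden_size = 30;                // must equal the model's layer1_size
//   std::vector<float> context_in(hidden_size, 1.0), context_out;
//   std::vector<std::string> history;
//   history.push_back("<s>");
//   float logprob = model.computeConditionalLogprob("hello", history,
//                                                   context_in, &context_out);
//   // context_out now holds the hidden state to pass to the next query.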