SphinxBase 0.6
src/libsphinxbase/lm/ngram_model_dmp.c
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
00002 /* ====================================================================
00003  * Copyright (c) 1999-2007 Carnegie Mellon University.  All rights
00004  * reserved.
00005  *
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions
00008  * are met:
00009  *
00010  * 1. Redistributions of source code must retain the above copyright
00011  *    notice, this list of conditions and the following disclaimer. 
00012  *
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in
00015  *    the documentation and/or other materials provided with the
00016  *    distribution.
00017  *
00018  * This work was supported in part by funding from the Defense Advanced 
00019  * Research Projects Agency and the National Science Foundation of the 
00020  * United States of America, and the CMU Sphinx Speech Consortium.
00021  *
00022  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
00023  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
00024  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00025  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
00026  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00027  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
00028  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
00029  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
00030  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
00031  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
00032  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00033  *
00034  * ====================================================================
00035  *
00036  */
00037 /*
00038  * \file ngram_model_dmp.c DMP format language models
00039  *
00040  * Author: David Huggins-Daines <dhuggins@cs.cmu.edu>
00041  */
00042 
00043 #include <assert.h>
00044 #include <stdio.h>
00045 #include <string.h>
00046 #include <stdlib.h>
00047 #include <limits.h>
00048 
00049 #include "sphinxbase/ckd_alloc.h"
00050 #include "sphinxbase/pio.h"
00051 #include "sphinxbase/err.h"
00052 #include "sphinxbase/byteorder.h"
00053 #include "sphinxbase/listelem_alloc.h"
00054 
00055 #include "ngram_model_dmp.h"
00056 
00057 static const char darpa_hdr[] = "Darpa Trigram LM";
00058 static ngram_funcs_t ngram_model_dmp_funcs;
00059 
00060 #define TSEG_BASE(m,b)          ((m)->lm3g.tseg_base[(b)>>LOG_BG_SEG_SZ])
00061 #define FIRST_BG(m,u)           ((m)->lm3g.unigrams[u].bigrams)
00062 #define FIRST_TG(m,b)           (TSEG_BASE((m),(b))+((m)->lm3g.bigrams[b].trigrams))
00063 
00064 static unigram_t *
00065 new_unigram_table(int32 n_ug)
00066 {
00067     unigram_t *table;
00068     int32 i;
00069 
00070     table = ckd_calloc(n_ug, sizeof(unigram_t));
00071     for (i = 0; i < n_ug; i++) {
00072         table[i].prob1.f = -99.0;
00073         table[i].bo_wt1.f = -99.0;
00074     }
00075     return table;
00076 }
00077 
00078 ngram_model_t *
00079 ngram_model_dmp_read(cmd_ln_t *config,
00080                      const char *file_name,
00081                      logmath_t *lmath)
00082 {
00083     ngram_model_t *base;
00084     ngram_model_dmp_t *model;
00085     FILE *fp;
00086     int do_mmap, do_swap;
00087     int32 is_pipe;
00088     int32 i, j, k, vn, n, ts;
00089     int32 n_unigram;
00090     int32 n_bigram;
00091     int32 n_trigram;
00092     char str[1024];
00093     unigram_t *ugptr;
00094     bigram_t *bgptr;
00095     trigram_t *tgptr;
00096     char *tmp_word_str;
00097     char *map_base = NULL;
00098     size_t offset = 0, filesize;
00099 
00100     base = NULL;
00101     do_mmap = FALSE;
00102     if (config)
00103         do_mmap = cmd_ln_boolean_r(config, "-mmap");
00104 
00105     if ((fp = fopen_comp(file_name, "rb", &is_pipe)) == NULL) {
00106         E_ERROR("Dump file %s not found\n", file_name);
00107         goto error_out;
00108     }
00109 
00110     if (is_pipe && do_mmap) {
00111         E_WARN("Dump file is compressed, will not use memory-mapped I/O\n");
00112         do_mmap = 0;
00113     }
00114 
00115     do_swap = FALSE;
00116     if (fread(&k, sizeof(k), 1, fp) != 1)
00117         goto error_out;
00118     if (k != strlen(darpa_hdr)+1) {
00119         SWAP_INT32(&k);
00120         if (k != strlen(darpa_hdr)+1) {
00121             E_ERROR("Wrong magic header size number %x: %s is not a dump file\n", k, file_name);
00122             goto error_out;
00123         }
00124         do_swap = 1;
00125     }
00126     if (fread(str, 1, k, fp) != (size_t) k) {
00127         E_ERROR("Cannot read header\n");
00128         goto error_out;
00129     }
00130     if (strncmp(str, darpa_hdr, k) != 0) {
00131         E_ERROR("Wrong header %s: %s is not a dump file\n", darpa_hdr);
00132         goto error_out;
00133     }
00134 
00135     if (do_mmap) {
00136         if (do_swap) {
00137             E_INFO
00138                 ("Byteswapping required, will not use memory-mapped I/O for LM file\n");
00139             do_mmap = 0;
00140         }
00141         else {
00142             E_INFO("Will use memory-mapped I/O for LM file\n");
00143 #ifdef __ADSPBLACKFIN__ /* This is true for both VisualDSP++ and uClinux. */
00144             E_FATAL("memory mapping is not supported at the moment.");
00145 #else
00146 #endif
00147         }
00148     }
00149 
00150     if (fread(&k, sizeof(k), 1, fp) != 1)
00151         goto error_out;
00152     if (do_swap) SWAP_INT32(&k);
00153     if (fread(str, 1, k, fp) != (size_t) k) {
00154         E_ERROR("Cannot read LM filename in header\n");
00155         goto error_out;
00156     }
00157 
00158     /* read version#, if present (must be <= 0) */
00159     if (fread(&vn, sizeof(vn), 1, fp) != 1)
00160         goto error_out;
00161     if (do_swap) SWAP_INT32(&vn);
00162     if (vn <= 0) {
00163         /* read and don't compare timestamps (we don't care) */
00164         if (fread(&ts, sizeof(ts), 1, fp) != 1)
00165             goto error_out;
00166         if (do_swap) SWAP_INT32(&ts);
00167 
00168         /* read and skip format description */
00169         for (;;) {
00170             if (fread(&k, sizeof(k), 1, fp) != 1)
00171                 goto error_out;
00172             if (do_swap) SWAP_INT32(&k);
00173             if (k == 0)
00174                 break;
00175             if (fread(str, 1, k, fp) != (size_t) k) {
00176                 E_ERROR("fread(word) failed\n");
00177                 goto error_out;
00178             }
00179         }
00180         /* read model->ucount */
00181         if (fread(&n_unigram, sizeof(n_unigram), 1, fp) != 1)
00182             goto error_out;
00183         if (do_swap) SWAP_INT32(&n_unigram);
00184     }
00185     else {
00186         n_unigram = vn;
00187     }
00188 
00189     /* read model->bcount, tcount */
00190     if (fread(&n_bigram, sizeof(n_bigram), 1, fp) != 1)
00191         goto error_out;
00192     if (do_swap) SWAP_INT32(&n_bigram);
00193     if (fread(&n_trigram, sizeof(n_trigram), 1, fp) != 1)
00194         goto error_out;
00195     if (do_swap) SWAP_INT32(&n_trigram);
00196     E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram);
00197 
00198     /* Allocate space for LM, including initial OOVs and placeholders; initialize it */
00199     model = ckd_calloc(1, sizeof(*model));
00200     base = &model->base;
00201     if (n_trigram > 0)
00202         n = 3;
00203     else if (n_bigram > 0)
00204         n = 2;
00205     else
00206         n = 1;
00207     ngram_model_init(base, &ngram_model_dmp_funcs, lmath, n, n_unigram);
00208     base->n_counts[0] = n_unigram;
00209     base->n_counts[1] = n_bigram;
00210     base->n_counts[2] = n_trigram;
00211 
00212     /* read unigrams (always in memory, as they contain dictionary
00213      * mappings that can't be precomputed, and also could have OOVs added) */
00214     model->lm3g.unigrams = new_unigram_table(n_unigram + 1);
00215     ugptr = model->lm3g.unigrams;
00216     for (i = 0; i <= n_unigram; ++i) {
00217         /* Skip over the mapping ID, we don't care about it. */
00218         if (fread(ugptr, sizeof(int32), 1, fp) != 1) {
00219             E_ERROR("fread(mapid[%d]) failed\n", i);
00220             goto error_out;
00221         }
00222         /* Read the actual unigram structure. */
00223         if (fread(ugptr, sizeof(unigram_t), 1, fp) != 1)  {
00224             E_ERROR("fread(unigrams) failed\n");
00225             ngram_model_free(base);
00226             fclose_comp(fp, is_pipe);
00227             return NULL;
00228         }
00229         /* Byte swap if necessary. */
00230         if (do_swap) {
00231             SWAP_INT32(&ugptr->prob1.l);
00232             SWAP_INT32(&ugptr->bo_wt1.l);
00233             SWAP_INT32(&ugptr->bigrams);
00234         }
00235         /* Convert values to log. */
00236         ugptr->prob1.l = logmath_log10_to_log(lmath, ugptr->prob1.f);
00237         ugptr->bo_wt1.l = logmath_log10_to_log(lmath, ugptr->bo_wt1.f);
00238         E_DEBUG(2, ("ug %d: prob %d bo %d bigrams %d\n",
00239                     i, ugptr->prob1.l, ugptr->bo_wt1.l, ugptr->bigrams));
00240         ++ugptr;
00241     }
00242     E_INFO("%8d = LM.unigrams(+trailer) read\n", n_unigram);
00243 
00244     /* Now mmap() the file and read in the rest of the (read-only) stuff. */
00245     if (do_mmap) {
00246         offset = ftell(fp);
00247         fseek(fp, 0, SEEK_END);
00248         filesize = ftell(fp);
00249         fseek(fp, offset, SEEK_SET);
00250 
00251         /* Check for improper word alignment. */
00252         if (offset & 0x3) {
00253             E_WARN("-mmap specified, but tseg_base is not word-aligned.  Will not memory-map.\n");
00254             do_mmap = FALSE;
00255         }
00256         else {
00257             model->dump_mmap = mmio_file_read(file_name);
00258             if (model->dump_mmap == NULL) {
00259                 do_mmap = FALSE;
00260             }
00261             else {
00262                 map_base = mmio_file_ptr(model->dump_mmap);
00263             }
00264         }
00265     }
00266     
00267     if (n_bigram > 0) {
00268         /* read bigrams */
00269         if (do_mmap) {
00270             model->lm3g.bigrams = (bigram_t *) (map_base + offset);
00271             offset += (n_bigram + 1) * sizeof(bigram_t);
00272         }
00273         else {
00274             model->lm3g.bigrams =
00275                 ckd_calloc(n_bigram + 1, sizeof(bigram_t));
00276             if (fread(model->lm3g.bigrams, sizeof(bigram_t), n_bigram + 1, fp)
00277                 != (size_t) n_bigram + 1) {
00278                 E_ERROR("fread(bigrams) failed\n");
00279                 goto error_out;
00280             }
00281             if (do_swap) {
00282                 for (i = 0, bgptr = model->lm3g.bigrams; i <= n_bigram;
00283                      i++, bgptr++) {
00284                     SWAP_INT16(&bgptr->wid);
00285                     SWAP_INT16(&bgptr->prob2);
00286                     SWAP_INT16(&bgptr->bo_wt2);
00287                     SWAP_INT16(&bgptr->trigrams);
00288                 }
00289             }
00290         }
00291         E_INFO("%8d = LM.bigrams(+trailer) read\n", n_bigram);
00292     }
00293 
00294     /* read trigrams */
00295     if (n_trigram > 0) {
00296         if (do_mmap) {
00297             model->lm3g.trigrams = (trigram_t *) (map_base + offset);
00298             offset += n_trigram * sizeof(trigram_t);
00299         }
00300         else {
00301             model->lm3g.trigrams =
00302                 ckd_calloc(n_trigram, sizeof(trigram_t));
00303             if (fread
00304                 (model->lm3g.trigrams, sizeof(trigram_t), n_trigram, fp)
00305                 != (size_t) n_trigram) {
00306                 E_ERROR("fread(trigrams) failed\n");
00307                 goto error_out;
00308             }
00309             if (do_swap) {
00310                 for (i = 0, tgptr = model->lm3g.trigrams; i < n_trigram;
00311                      i++, tgptr++) {
00312                     SWAP_INT16(&tgptr->wid);
00313                     SWAP_INT16(&tgptr->prob3);
00314                 }
00315             }
00316         }
00317         E_INFO("%8d = LM.trigrams read\n", n_trigram);
00318         /* Initialize tginfo */
00319         model->lm3g.tginfo = ckd_calloc(n_unigram, sizeof(tginfo_t *));
00320         model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t));
00321     }
00322 
00323     if (n_bigram > 0) {
00324         /* read n_prob2 and prob2 array (in memory) */
00325         if (do_mmap)
00326             fseek(fp, offset, SEEK_SET);
00327         if (fread(&k, sizeof(k), 1, fp) != 1)
00328             goto error_out;
00329         if (do_swap) SWAP_INT32(&k);
00330         model->lm3g.n_prob2 = k;
00331         model->lm3g.prob2 = ckd_calloc(k, sizeof(*model->lm3g.prob2));
00332         if (fread(model->lm3g.prob2, sizeof(*model->lm3g.prob2), k, fp) != (size_t) k) {
00333             E_ERROR("fread(prob2) failed\n");
00334             goto error_out;
00335         }
00336         for (i = 0; i < k; i++) {
00337             if (do_swap)
00338                 SWAP_INT32(&model->lm3g.prob2[i].l);
00339             /* Convert values to log. */
00340             model->lm3g.prob2[i].l = logmath_log10_to_log(lmath, model->lm3g.prob2[i].f);
00341         }
00342         E_INFO("%8d = LM.prob2 entries read\n", k);
00343     }
00344 
00345     /* read n_bo_wt2 and bo_wt2 array (in memory) */
00346     if (base->n > 2) {
00347         if (fread(&k, sizeof(k), 1, fp) != 1)
00348             goto error_out;
00349         if (do_swap) SWAP_INT32(&k);
00350         model->lm3g.n_bo_wt2 = k;
00351         model->lm3g.bo_wt2 = ckd_calloc(k, sizeof(*model->lm3g.bo_wt2));
00352         if (fread(model->lm3g.bo_wt2, sizeof(*model->lm3g.bo_wt2), k, fp) != (size_t) k) {
00353             E_ERROR("fread(bo_wt2) failed\n");
00354             goto error_out;
00355         }
00356         for (i = 0; i < k; i++) {
00357             if (do_swap)
00358                 SWAP_INT32(&model->lm3g.bo_wt2[i].l);
00359             /* Convert values to log. */
00360             model->lm3g.bo_wt2[i].l = logmath_log10_to_log(lmath, model->lm3g.bo_wt2[i].f);
00361         }
00362         E_INFO("%8d = LM.bo_wt2 entries read\n", k);
00363     }
00364 
00365     /* read n_prob3 and prob3 array (in memory) */
00366     if (base->n > 2) {
00367         if (fread(&k, sizeof(k), 1, fp) != 1)
00368                 goto error_out;
00369         if (do_swap) SWAP_INT32(&k);
00370         model->lm3g.n_prob3 = k;
00371         model->lm3g.prob3 = ckd_calloc(k, sizeof(*model->lm3g.prob3));
00372         if (fread(model->lm3g.prob3, sizeof(*model->lm3g.prob3), k, fp) != (size_t) k) {
00373             E_ERROR("fread(prob3) failed\n");
00374             goto error_out;
00375         }
00376         for (i = 0; i < k; i++) {
00377             if (do_swap)
00378                 SWAP_INT32(&model->lm3g.prob3[i].l);
00379             /* Convert values to log. */
00380             model->lm3g.prob3[i].l = logmath_log10_to_log(lmath, model->lm3g.prob3[i].f);
00381         }
00382         E_INFO("%8d = LM.prob3 entries read\n", k);
00383     }
00384 
00385     /* read tseg_base size and tseg_base */
00386     if (do_mmap)
00387         offset = ftell(fp);
00388     if (n_trigram > 0) {
00389         if (do_mmap) {
00390             memcpy(&k, map_base + offset, sizeof(k));
00391             offset += sizeof(int32);
00392             model->lm3g.tseg_base = (int32 *) (map_base + offset);
00393             offset += k * sizeof(int32);
00394         }
00395         else {
00396             k = (n_bigram + 1) / BG_SEG_SZ + 1;
00397             if (fread(&k, sizeof(k), 1, fp) != 1)
00398                 goto error_out;
00399             if (do_swap) SWAP_INT32(&k);
00400             model->lm3g.tseg_base = ckd_calloc(k, sizeof(int32));
00401             if (fread(model->lm3g.tseg_base, sizeof(int32), k, fp) !=
00402                 (size_t) k) {
00403                 E_ERROR("fread(tseg_base) failed\n");
00404                 goto error_out;
00405             }
00406             if (do_swap)
00407                 for (i = 0; i < k; i++)
00408                     SWAP_INT32(&model->lm3g.tseg_base[i]);
00409         }
00410         E_INFO("%8d = LM.tseg_base entries read\n", k);
00411     }
00412 
00413     /* read ascii word strings */
00414     if (do_mmap) {
00415         memcpy(&k, map_base + offset, sizeof(k));
00416         offset += sizeof(int32);
00417         tmp_word_str = (char *) (map_base + offset);
00418         offset += k;
00419     }
00420     else {
00421         base->writable = TRUE;
00422         if (fread(&k, sizeof(k), 1, fp) != 1)
00423             goto error_out;
00424         if (do_swap) SWAP_INT32(&k);
00425         tmp_word_str = ckd_calloc(k, 1);
00426         if (fread(tmp_word_str, 1, k, fp) != (size_t) k) {
00427             E_ERROR("fread(word-string) failed\n");
00428             goto error_out;
00429         }
00430     }
00431 
00432     /* First make sure string just read contains n_counts[0] words (PARANOIA!!) */
00433     for (i = 0, j = 0; i < k; i++)
00434         if (tmp_word_str[i] == '\0')
00435             j++;
00436     if (j != n_unigram) {
00437         E_ERROR("Error reading word strings (%d doesn't match n_unigrams %d)\n",
00438                 j, n_unigram);
00439         goto error_out;
00440     }
00441 
00442     /* Break up string just read into words */
00443     if (do_mmap) {
00444         j = 0;
00445         for (i = 0; i < n_unigram; i++) {
00446             base->word_str[i] = tmp_word_str + j;
00447             if (hash_table_enter(base->wid, base->word_str[i],
00448                                  (void *)(long)i) != (void *)(long)i) {
00449                 E_WARN("Duplicate word in dictionary: %s\n", base->word_str[i]);
00450             }
00451             j += strlen(base->word_str[i]) + 1;
00452         }
00453     }
00454     else {
00455         j = 0;
00456         for (i = 0; i < n_unigram; i++) {
00457             base->word_str[i] = ckd_salloc(tmp_word_str + j);
00458             if (hash_table_enter(base->wid, base->word_str[i],
00459                                  (void *)(long)i) != (void *)(long)i) {
00460                 E_WARN("Duplicate word in dictionary: %s\n", base->word_str[i]);
00461             }
00462             j += strlen(base->word_str[i]) + 1;
00463         }
00464         free(tmp_word_str);
00465     }
00466     E_INFO("%8d = ascii word strings read\n", i);
00467 
00468     fclose_comp(fp, is_pipe);
00469     return base;
00470 
00471 error_out:
00472     if (fp)
00473         fclose_comp(fp, is_pipe);
00474     ngram_model_free(base);
00475     return NULL;
00476 }
00477 
00478 ngram_model_dmp_t *
00479 ngram_model_dmp_build(ngram_model_t *base)
00480 {
00481     ngram_model_dmp_t *model;
00482     ngram_model_t *newbase;
00483     ngram_iter_t *itor;
00484     sorted_list_t sorted_prob2;
00485     sorted_list_t sorted_bo_wt2;
00486     sorted_list_t sorted_prob3;
00487     bigram_t *bgptr;
00488     trigram_t *tgptr;
00489     int i, bgcount, tgcount, seg;
00490 
00491     if (base->funcs == &ngram_model_dmp_funcs) {
00492         E_INFO("Using existing DMP model.\n");
00493         return (ngram_model_dmp_t *)ngram_model_retain(base);
00494     }
00495 
00496     /* Initialize new base model structure with params from base. */
00497     E_INFO("Building DMP model...\n");
00498     model = ckd_calloc(1, sizeof(*model));
00499     newbase = &model->base;
00500     ngram_model_init(newbase, &ngram_model_dmp_funcs,
00501                      logmath_retain(base->lmath),
00502                      base->n, base->n_counts[0]);
00503     /* Copy N-gram counts over. */
00504     memcpy(newbase->n_counts, base->n_counts,
00505            base->n * sizeof(*base->n_counts));
00506     /* Make sure word strings are freed. */
00507     newbase->writable = TRUE;
00508     /* Initialize unigram table and string table. */
00509     model->lm3g.unigrams = new_unigram_table(newbase->n_counts[0] + 1);
00510     for (itor = ngram_model_mgrams(base, 0); itor;
00511          itor = ngram_iter_next(itor)) {
00512         int32 prob1, bo_wt1;
00513         int32 const *wids;
00514 
00515         /* Can't guarantee they will go in unigram order, so just to
00516          * be correct, we do this... */
00517         wids = ngram_iter_get(itor, &prob1, &bo_wt1);
00518         model->lm3g.unigrams[wids[0]].prob1.l = prob1;
00519         model->lm3g.unigrams[wids[0]].bo_wt1.l = bo_wt1;
00520         newbase->word_str[wids[0]] = ckd_salloc(ngram_word(base, wids[0]));
00521         if ((hash_table_enter_int32(newbase->wid,
00522                                     newbase->word_str[wids[0]], wids[0]))
00523             != wids[0]) {
00524                 E_WARN("Duplicate word in dictionary: %s\n", newbase->word_str[wids[0]]);
00525         }
00526     }
00527     E_INFO("%8d = #unigrams created\n", newbase->n_counts[0]);
00528                 
00529     if (newbase->n < 2) 
00530         return model;
00531                          
00532     /* Construct quantized probability table for bigrams and
00533      * (optionally) trigrams.  Hesitate to use the "sorted list" thing
00534      * since it isn't so useful, but it's there already. */
00535     init_sorted_list(&sorted_prob2);
00536     if (newbase->n > 2) {
00537         init_sorted_list(&sorted_bo_wt2);
00538         init_sorted_list(&sorted_prob3);
00539     }
00540     /* Construct bigram and trigram arrays. */
00541     bgptr = model->lm3g.bigrams = ckd_calloc(newbase->n_counts[1] + 1, sizeof(bigram_t));
00542     if (newbase->n > 2) {
00543         tgptr = model->lm3g.trigrams = ckd_calloc(newbase->n_counts[2], sizeof(trigram_t));
00544         model->lm3g.tseg_base =
00545             ckd_calloc((newbase->n_counts[1] + 1) / BG_SEG_SZ + 1, sizeof(int32));
00546     }
00547     else
00548         tgptr = NULL;
00549     /* Since bigrams and trigrams have to be contiguous with others
00550      * with the same N-1-gram, we traverse them in depth-first order
00551      * to build the bigram and trigram arrays. */
00552     for (i = 0; i < newbase->n_counts[0]; ++i) {
00553         ngram_iter_t *uitor;
00554         bgcount = bgptr - model->lm3g.bigrams;
00555         /* First bigram index (same as next if no bigrams...) */
00556         model->lm3g.unigrams[i].bigrams = bgcount;
00557         E_DEBUG(2, ("unigram %d: %s => bigram %d\n", i, newbase->word_str[i], bgcount));
00558         /* All bigrams corresponding to unigram i */
00559         uitor = ngram_ng_iter(base, i, NULL, 0);
00560         for (itor = ngram_iter_successors(uitor);
00561              itor; ++bgptr, itor = ngram_iter_next(itor)) {
00562             int32 prob2, bo_wt2;
00563             int32 const *wids;
00564             ngram_iter_t *titor;
00565 
00566             wids = ngram_iter_get(itor, &prob2, &bo_wt2);
00567 
00568             assert (bgptr - model->lm3g.bigrams < newbase->n_counts[1]);
00569 
00570             bgptr->wid = wids[1];
00571             bgptr->prob2 = sorted_id(&sorted_prob2, &prob2);
00572             if (newbase->n > 2) {
00573                 tgcount = (tgptr - model->lm3g.trigrams);
00574                 bgcount = (bgptr - model->lm3g.bigrams);
00575 
00576                 /* Backoff weight (only if there are trigrams...) */
00577                 bgptr->bo_wt2 = sorted_id(&sorted_bo_wt2, &bo_wt2);
00578 
00579                 /* Find bigram segment for this bigram (this isn't
00580                  * used unless there are trigrams) */
00581                 seg = bgcount >> LOG_BG_SEG_SZ;
00582                 /* If we just crossed a bigram segment boundary, then
00583                  * point tseg_base for the new segment to the current
00584                  * trigram pointer. */
00585                 if (seg != (bgcount - 1) >> LOG_BG_SEG_SZ)
00586                     model->lm3g.tseg_base[seg] = tgcount;
00587                 /* Now calculate the trigram offset. */
00588                 bgptr->trigrams = tgcount - model->lm3g.tseg_base[seg];
00589                 E_DEBUG(2, ("bigram %d %s %s => trigram %d:%d\n",
00590                             bgcount,
00591                             newbase->word_str[wids[0]],
00592                             newbase->word_str[wids[1]],
00593                             seg, bgptr->trigrams));
00594 
00595                 /* And fill in successors' trigram info. */
00596                 for (titor = ngram_iter_successors(itor);
00597                      titor; ++tgptr, titor = ngram_iter_next(titor)) {
00598                     int32 prob3, dummy;
00599 
00600                     assert(tgptr - model->lm3g.trigrams < newbase->n_counts[2]);
00601                     wids = ngram_iter_get(titor, &prob3, &dummy);
00602                     tgptr->wid = wids[2];
00603                     tgptr->prob3 = sorted_id(&sorted_prob3, &prob3);
00604                     E_DEBUG(2, ("trigram %d %s %s %s => prob %d\n",
00605                                 tgcount,
00606                                 newbase->word_str[wids[0]],
00607                                 newbase->word_str[wids[1]],
00608                                 newbase->word_str[wids[2]],
00609                                 tgptr->prob3));
00610                 }
00611             }
00612         }
00613         ngram_iter_free(uitor);
00614     }
00615     /* Add sentinal unigram and bigram records. */
00616     bgcount = bgptr - model->lm3g.bigrams;
00617     tgcount = tgptr - model->lm3g.trigrams;
00618     seg = bgcount >> LOG_BG_SEG_SZ;
00619     if (seg != (bgcount - 1) >> LOG_BG_SEG_SZ)
00620         model->lm3g.tseg_base[seg] = tgcount;
00621     model->lm3g.unigrams[i].bigrams = bgcount;
00622     if (newbase->n > 2)
00623         bgptr->trigrams = tgcount - model->lm3g.tseg_base[seg];
00624 
00625     /* Now create probability tables. */
00626     model->lm3g.n_prob2 = sorted_prob2.free;
00627     model->lm3g.prob2 = vals_in_sorted_list(&sorted_prob2);
00628     E_INFO("%8d = #bigrams created\n", newbase->n_counts[1]);
00629     E_INFO("%8d = #prob2 entries\n", model->lm3g.n_prob2);
00630     free_sorted_list(&sorted_prob2);
00631     if (newbase->n > 2) {
00632         /* Create trigram bo-wts array. */
00633         model->lm3g.n_bo_wt2 = sorted_bo_wt2.free;
00634         model->lm3g.bo_wt2 = vals_in_sorted_list(&sorted_bo_wt2);
00635         free_sorted_list(&sorted_bo_wt2);
00636         E_INFO("%8d = #bo_wt2 entries\n", model->lm3g.n_bo_wt2);
00637         /* Create trigram probability table. */
00638         model->lm3g.n_prob3 = sorted_prob3.free;
00639         model->lm3g.prob3 = vals_in_sorted_list(&sorted_prob3);
00640         E_INFO("%8d = #trigrams created\n", newbase->n_counts[2]);
00641         E_INFO("%8d = #prob3 entries\n", model->lm3g.n_prob3);
00642         free_sorted_list(&sorted_prob3);
00643         /* Initialize tginfo */
00644         model->lm3g.tginfo = ckd_calloc(newbase->n_counts[0], sizeof(tginfo_t *));
00645         model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t));
00646     }
00647 
00648     return model;
00649 }
00650 
00651 static void
00652 fwrite_int32(FILE *fh, int32 val)
00653 {
00654     fwrite(&val, 4, 1, fh);
00655 }
00656 
00657 static void
00658 fwrite_ug(FILE *fh, unigram_t *ug, logmath_t *lmath)
00659 {
00660     int32 bogus = -1;
00661     float32 log10val;
00662 
00663     /* Bogus dictionary mapping field. */
00664     fwrite(&bogus, 4, 1, fh);
00665     /* Convert values to log10. */
00666     log10val = logmath_log_to_log10(lmath, ug->prob1.l);
00667     fwrite(&log10val, 4, 1, fh);
00668     log10val = logmath_log_to_log10(lmath, ug->bo_wt1.l);
00669     fwrite(&log10val, 4, 1, fh);
00670     fwrite_int32(fh, ug->bigrams);
00671 }
00672 
00673 static void
00674 fwrite_bg(FILE *fh, bigram_t *bg)
00675 {
00676     fwrite(bg, sizeof(*bg), 1, fh);
00677 }
00678 
00679 static void
00680 fwrite_tg(FILE *fh, trigram_t *tg)
00681 {
00682     fwrite(tg, sizeof(*tg), 1, fh);
00683 }
00684 
/*
 * Human-readable description of the file layout.  Each entry is
 * written into the dump header as a length-prefixed string; the
 * list is terminated by NULL.
 */
static const char *fmtdesc[] = {
    "BEGIN FILE FORMAT DESCRIPTION",
    "Header string length (int32) and string (including trailing 0)",
    "Original LM filename string-length (int32) and filename (including trailing 0)",
    "(int32) version number (present iff value <= 0)",
    "(int32) original LM file modification timestamp (iff version# present)",
    "(int32) string-length and string (including trailing 0) (iff version# present)",
    "... previous entry continued any number of times (iff version# present)",
    "(int32) 0 (terminating sequence of strings) (iff version# present)",
    "(int32) log_bg_seg_sz (present iff different from default value of LOG2_BG_SEG_SZ)",
    "(int32) lm_t.ucount (must be > 0)",
    "(int32) lm_t.bcount",
    "(int32) lm_t.tcount",
    "lm_t.ucount+1 unigrams (including sentinel)",
    "lm_t.bcount+1 bigrams (including sentinel 64 bits (bg_t) each if version=-1/-2, 128 bits (bg32_t) each if version=-3",
    "lm_t.tcount trigrams (present iff lm_t.tcount > 0 32 bits (tg_t) each if version=-1/-2, 64 bits (tg32_t) each if version=-3)",
    "(int32) lm_t.n_prob2",
    "(int32) lm_t.prob2[]",
    "(int32) lm_t.n_bo_wt2 (present iff lm_t.tcount > 0)",
    "(int32) lm_t.bo_wt2[] (present iff lm_t.tcount > 0)",
    "(int32) lm_t.n_prob3 (present iff lm_t.tcount > 0)",
    "(int32) lm_t.prob3[] (present iff lm_t.tcount > 0)",
    "(int32) (lm_t.bcount+1)/BG_SEG_SZ+1 (present iff lm_t.tcount > 0)",
    "(int32) lm_t.tseg_base[] (present iff lm_t.tcount > 0)",
    "(int32) Sum(all word string-lengths, including trailing 0 for each)",
    "All word strings (including trailing 0 for each)",
    "END FILE FORMAT DESCRIPTION",
    NULL
};
00716 
00717 static void
00718 ngram_model_dmp_write_header(FILE * fh)
00719 {
00720     int32 k;
00721     k = strlen(darpa_hdr) + 1;
00722     fwrite_int32(fh, k);
00723     fwrite(darpa_hdr, 1, k, fh);
00724 }
00725 
00726 static void
00727 ngram_model_dmp_write_lm_filename(FILE * fh, const char *lmfile)
00728 {
00729     int32 k;
00730 
00731     k = strlen(lmfile) + 1;
00732     fwrite_int32(fh, k);
00733     fwrite(lmfile, 1, k, fh);
00734 }
00735 
00736 #define LMDMP_VERSION_TG_16BIT -1 
00740 static void
00741 ngram_model_dmp_write_version(FILE * fh, int32 mtime)
00742 {
00743     fwrite_int32(fh, LMDMP_VERSION_TG_16BIT);   /* version # */
00744     fwrite_int32(fh, mtime);
00745 }
00746 
00747 static void
00748 ngram_model_dmp_write_ngram_counts(FILE * fh, ngram_model_t *model)
00749 {
00750     fwrite_int32(fh, model->n_counts[0]);
00751     fwrite_int32(fh, model->n_counts[1]);
00752     fwrite_int32(fh, model->n_counts[2]);
00753 }
00754 
00755 static void
00756 ngram_model_dmp_write_fmtdesc(FILE * fh)
00757 {
00758     int32 i, k;
00759     long pos;
00760 
00761     /* Write file format description into header */
00762     for (i = 0; fmtdesc[i] != NULL; i++) {
00763         k = strlen(fmtdesc[i]) + 1;
00764         fwrite_int32(fh, k);
00765         fwrite(fmtdesc[i], 1, k, fh);
00766     }
00767     /* Pad it out in order to achieve 32-bit alignment */
00768     pos = ftell(fh);
00769     k = pos & 3;
00770     if (k) {
00771         fwrite_int32(fh, 4-k);
00772         fwrite("!!!!", 1, 4-k, fh);
00773     }
00774     fwrite_int32(fh, 0);
00775 }
00776 
00777 static void
00778 ngram_model_dmp_write_unigram(FILE *fh, ngram_model_t *model)
00779 {
00780     ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
00781     int32 i;
00782 
00783     for (i = 0; i <= model->n_counts[0]; i++) {
00784         fwrite_ug(fh, &(lm->lm3g.unigrams[i]), model->lmath);
00785     }
00786 }
00787 
00788 
00789 static void
00790 ngram_model_dmp_write_bigram(FILE *fh, ngram_model_t *model)
00791 {
00792     ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
00793     int32 i;
00794 
00795     for (i = 0; i <= model->n_counts[1]; i++) {
00796         fwrite_bg(fh, &(lm->lm3g.bigrams[i]));
00797     }
00798 
00799 }
00800 
00801 static void
00802 ngram_model_dmp_write_trigram(FILE *fh, ngram_model_t *model)
00803 {
00804     ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
00805     int32 i;
00806 
00807     for (i = 0; i < model->n_counts[2]; i++) {
00808         fwrite_tg(fh, &(lm->lm3g.trigrams[i]));
00809     }
00810 }
00811 
00812 static void
00813 ngram_model_dmp_write_bgprob(FILE *fh, ngram_model_t *model)
00814 {
00815     ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
00816     int32 i;
00817 
00818     fwrite_int32(fh, lm->lm3g.n_prob2);
00819     for (i = 0; i < lm->lm3g.n_prob2; i++) {
00820         float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.prob2[i].l);
00821         fwrite(&log10val, 4, 1, fh);
00822     }
00823 }
00824 
00825 static void
00826 ngram_model_dmp_write_tgbowt(FILE *fh, ngram_model_t *model)
00827 {
00828     ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
00829     int32 i;
00830 
00831     fwrite_int32(fh, lm->lm3g.n_bo_wt2);
00832     for (i = 0; i < lm->lm3g.n_bo_wt2; i++) {
00833         float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.bo_wt2[i].l);
00834         fwrite(&log10val, 4, 1, fh);
00835     }
00836 }
00837 
00838 static void
00839 ngram_model_dmp_write_tgprob(FILE *fh, ngram_model_t *model)
00840 {
00841     ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
00842     int32 i;
00843 
00844     fwrite_int32(fh, lm->lm3g.n_prob3);
00845     for (i = 0; i < lm->lm3g.n_prob3; i++) {
00846         float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.prob3[i].l);
00847         fwrite(&log10val, 4, 1, fh);
00848     }
00849 }
00850 
00851 static void
00852 ngram_model_dmp_write_tg_segbase(FILE *fh, ngram_model_t *model)
00853 {
00854     ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
00855     int32 i, k;
00856 
00857     k = (model->n_counts[1] + 1) / BG_SEG_SZ + 1;
00858     fwrite_int32(fh, k);
00859     for (i = 0; i < k; i++)
00860         fwrite_int32(fh, lm->lm3g.tseg_base[i]);
00861 }
00862 
00863 static void
00864 ngram_model_dmp_write_wordstr(FILE *fh, ngram_model_t *model)
00865 {
00866     int32 i, k;
00867 
00868     k = 0;
00869     for (i = 0; i < model->n_counts[0]; i++)
00870         k += strlen(model->word_str[i]) + 1;
00871     fwrite_int32(fh, k);
00872     for (i = 0; i < model->n_counts[0]; i++)
00873         fwrite(model->word_str[i], 1,
00874                strlen(model->word_str[i]) + 1, fh);
00875 }
00876 
00877 int
00878 ngram_model_dmp_write(ngram_model_t *base,
00879                       const char *file_name)
00880 {
00881     ngram_model_dmp_t *model;
00882     ngram_model_t *newbase;
00883     FILE *fh;
00884 
00885     /* First, construct a DMP model from the base model. */
00886     model = ngram_model_dmp_build(base);
00887     newbase = &model->base;
00888 
00889     /* Now write it, confident in the knowledge that it's the right
00890      * kind of language model internally. */
00891     if ((fh = fopen(file_name, "wb")) == NULL) {
00892         E_ERROR("Cannot create file %s\n", file_name);
00893         return -1;
00894     }
00895     ngram_model_dmp_write_header(fh);
00896     ngram_model_dmp_write_lm_filename(fh, file_name);
00897     ngram_model_dmp_write_version(fh, 0);
00898     ngram_model_dmp_write_fmtdesc(fh);
00899     ngram_model_dmp_write_ngram_counts(fh, newbase);
00900     ngram_model_dmp_write_unigram(fh, newbase);
00901     if (newbase->n > 1) {
00902         ngram_model_dmp_write_bigram(fh, newbase);
00903         if (newbase->n > 2) {
00904             ngram_model_dmp_write_trigram(fh, newbase);
00905         }
00906         ngram_model_dmp_write_bgprob(fh, newbase);
00907         if (newbase->n > 2) {
00908                 ngram_model_dmp_write_tgbowt(fh, newbase);
00909                 ngram_model_dmp_write_tgprob(fh, newbase);
00910                 ngram_model_dmp_write_tg_segbase(fh, newbase);
00911         }
00912     }
00913     ngram_model_dmp_write_wordstr(fh, newbase);
00914     ngram_model_free(newbase);
00915 
00916     return fclose(fh);
00917 }
00918 
00919 static int
00920 ngram_model_dmp_apply_weights(ngram_model_t *base, float32 lw,
00921                               float32 wip, float32 uw)
00922 {
00923     ngram_model_dmp_t *model = (ngram_model_dmp_t *)base;
00924     lm3g_apply_weights(base, &model->lm3g, lw, wip, uw);
00925     return 0;
00926 }
00927 
00928 /* Lousy "templating" for things that are largely the same in DMP and
00929  * ARPA models, except for the bigram and trigram types and some
00930  * names. */
00931 #define NGRAM_MODEL_TYPE ngram_model_dmp_t
00932 #include "lm3g_templates.c"
00933 
00934 static void
00935 ngram_model_dmp_free(ngram_model_t *base)
00936 {
00937     ngram_model_dmp_t *model = (ngram_model_dmp_t *)base;
00938 
00939     ckd_free(model->lm3g.unigrams);
00940     ckd_free(model->lm3g.prob2);
00941     if (model->dump_mmap) {
00942         mmio_file_unmap(model->dump_mmap);
00943     } 
00944     else {
00945         ckd_free(model->lm3g.bigrams);
00946         if (base->n > 2) {
00947             ckd_free(model->lm3g.trigrams);
00948             ckd_free(model->lm3g.tseg_base);
00949         }
00950     }
00951     if (base->n > 2) {
00952         ckd_free(model->lm3g.bo_wt2);
00953         ckd_free(model->lm3g.prob3);
00954     }
00955 
00956     lm3g_tginfo_free(base, &model->lm3g);
00957 }
00958 
00959 static ngram_funcs_t ngram_model_dmp_funcs = {
00960     ngram_model_dmp_free,          /* free */
00961     ngram_model_dmp_apply_weights, /* apply_weights */
00962     lm3g_template_score,           /* score */
00963     lm3g_template_raw_score,       /* raw_score */
00964     lm3g_template_add_ug,          /* add_ug */
00965     lm3g_template_flush,           /* flush */
00966     lm3g_template_iter,             /* iter */
00967     lm3g_template_mgrams,          /* mgrams */
00968     lm3g_template_successors,      /* successors */
00969     lm3g_template_iter_get,        /* iter_get */
00970     lm3g_template_iter_next,       /* iter_next */
00971     lm3g_template_iter_free        /* iter_free */
00972 };