File: ngram.h

package info (click to toggle)
libgooglepinyin 0.1.2-4
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 7,640 kB
  • ctags: 841
  • sloc: cpp: 8,256; ansic: 200; makefile: 13
file content (96 lines) | stat: -rw-r--r-- 2,884 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef PINYINIME_INCLUDE_NGRAM_H__
#define PINYINIME_INCLUDE_NGRAM_H__

#include <stdio.h>
#include <stdlib.h>
#include "./dictdef.h"

namespace ime_pinyin {

// Element type for code book indices: one unsigned char per entry (this is the
// element type of NGram::lma_freq_idx_).
typedef unsigned char CODEBOOK_TYPE;

// NOTE(review): presumably the number of entries in the score code book,
// chosen to match the 256 values an 8-bit CODEBOOK_TYPE index can take --
// confirm against the implementation file.
static const size_t kCodeBookSize = 256;

// Unigram score model for lemmas.
//
// The class declares storage for a table of LmaScoreType codes (freq_codes_)
// and a table of CODEBOOK_TYPE indices (lma_freq_idx_), can be saved to and
// loaded from a FILE stream, and converts probabilities to scores.
// An NGram reference is obtained through get_instance().
// NOTE(review): the constructor is public, so nothing in this header prevents
// additional instances from being created -- confirm intended usage.
class NGram {
 public:
  // The maximum score of a lemma item.
  static const LmaScoreType kMaxScore = 0x3fff;

  // In order to reduce the storage size, the original log value is amplified
  // by kLogValueAmplifier, and the result is stored as an LmaScoreType.
  // The amplifier is negative; after this process, an item with a lower score
  // has a higher frequency.
  static const int kLogValueAmplifier = -800;

  // System words' total frequency. It is not the real total frequency;
  // instead, it is only used to adjust system lemmas' scores when the user
  // dictionary's total frequency changes.
  // In this version, frequencies of system lemmas are fixed. Making them
  // changeable is being considered for a later version.
  static const size_t kSysDictTotalFreq = 100000000;

 private:

  // Static NGram pointer; only declared here, so it must be defined in the
  // implementation file.
  // NOTE(review): presumably the object returned by get_instance() -- confirm.
  static NGram* instance_;

  // NOTE(review): presumably true once the model data has been loaded or
  // built -- confirm in the implementation file.
  bool initialized_;
  // NOTE(review): presumably the number of entries in lma_freq_idx_ -- confirm.
  size_t idx_num_;

  // Total frequency of all non-system dictionaries; see
  // set_total_freq_none_sys().
  size_t total_freq_none_sys_;

  // Score compensation for system dictionary lemmas.
  // After the user adds some user lemmas, the total frequency changes, and
  // this value is used to normalize the score.
  float sys_score_compensation_;

#ifdef ___BUILD_MODEL___
  // Only present when ___BUILD_MODEL___ is defined.
  // NOTE(review): presumably the double-precision form of the code book used
  // while building the model -- confirm.
  double *freq_codes_df_;
#endif
  // Table of LmaScoreType values.
  // NOTE(review): presumably the code book with kCodeBookSize entries --
  // confirm.
  LmaScoreType *freq_codes_;
  // Table of CODEBOOK_TYPE values.
  // NOTE(review): presumably one index into freq_codes_ per lemma -- confirm.
  CODEBOOK_TYPE *lma_freq_idx_;

 public:
  NGram();
  ~NGram();

  // Returns a reference to an NGram object.
  // NOTE(review): presumably the shared object tracked by instance_ -- confirm.
  static NGram& get_instance();

  // Save the model to / load the model from the given stream.
  // NOTE(review): presumably returns true on success; the expected stream
  // mode and position are not visible from this header -- confirm.
  bool save_ngram(FILE *fp);
  bool load_ngram(FILE *fp);

  // Set the total frequency of all non-system dictionaries.
  void set_total_freq_none_sys(size_t freq_none_sys);

  // Returns a float value for the lemma with the given id.
  // NOTE(review): by its name this looks like the unigram value for lma_id;
  // whether it is a raw probability or a score is not visible here -- confirm.
  float get_uni_psb(LemmaIdType lma_id);

  // Convert a probability to a score. The stored score is limited to
  // kMaxScore, but at runtime a float expression is also needed to get an
  // accurate value of the score.
  // After the conversion, a lower score indicates a higher probability of the
  // item.
  static float convert_psb_to_score(double psb);

#ifdef ___BUILD_MODEL___
  // For constructing the unigram model. Only available when
  // ___BUILD_MODEL___ is defined.
  // lemma_arr points to num LemmaEntry items.
  // NOTE(review): next_idx_unused is presumably the first lemma id not yet in
  // use; the meaning of the bool result is not visible here -- confirm.
  bool build_unigram(LemmaEntry *lemma_arr, size_t num,
                     LemmaIdType next_idx_unused);
#endif
};
}

#endif  // PINYINIME_INCLUDE_NGRAM_H__