1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173
|
/*
* libpinyin
* Library to deal with pinyin.
*
* Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "pinyin_internal.h"
#include "k_mixture_model.h"
/* Write the one-line usage summary for this tool to standard output. */
void print_help(){
    fputs("Usage: validate_k_mixture_model <FILENAME>\n", stdout);
}
/**
 * validate_unigram:
 * @bigram: the k mixture model to check.
 *
 * Checks the magic header against the per-token array headers:
 *  - the magic header must exist;
 *  - its word count and total freq must both be non-zero and equal;
 *  - the sum of m_WC over all array headers must equal the header's
 *    word count, and the sum of m_freq must equal its total freq.
 *
 * A diagnostic is written to stderr for the first failed check.
 *
 * Returns: true when every check passes, false otherwise.
 */
bool validate_unigram(KMixtureModelBigram * bigram){
    KMixtureModelMagicHeader magic_header;
    if( !bigram->get_magic_header(magic_header) ){
        fprintf(stderr, "no magic header in k mixture model.\n");
        return false;
    }

    guint32 expected_word_count = magic_header.m_WC;
    if ( 0 == expected_word_count ){
        fprintf(stderr, "word count in magic header is unexpected zero.\n");
        return false;
    }

    guint32 expected_total_freq = magic_header.m_total_freq;
    if ( 0 == expected_total_freq ){
        fprintf(stderr, "total freq in magic header is unexpected zero.\n");
        return false;
    }

    if ( expected_word_count != expected_total_freq ){
        fprintf(stderr, "the word count doesn't match the total freq.\n");
        return false;
    }

    GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
    bigram->get_all_items(items);

    /* Accumulate the per-token counters stored in the array headers. */
    guint32 word_count = 0; guint32 total_freq = 0;
    for (size_t i = 0; i < items->len; ++i) {
        phrase_token_t * token = &g_array_index(items, phrase_token_t, i);
        KMixtureModelArrayHeader array_header;
        check_result(bigram->get_array_header(*token, array_header));
        word_count += array_header.m_WC;
        total_freq += array_header.m_freq;
    }

    /* The token list is not needed past this point; release it before the
     * comparisons so the failure paths below cannot leak it. */
    g_array_free(items, TRUE);

    /* Counters are guint32, so they are printed as unsigned (%u). */
    if ( word_count != expected_word_count ){
        fprintf(stderr, "word count in magic header:%u\n",
                expected_word_count);
        fprintf(stderr, "sum of word count in array headers:%u\n", word_count);
        fprintf(stderr, "the sum differs from word count.\n");
        return false;
    }

    if ( total_freq != expected_total_freq ){
        fprintf(stderr, "total freq in magic header:%u\n",
                expected_total_freq);
        fprintf(stderr, "sum of freqs in array headers:%u\n", total_freq);
        fprintf(stderr, "the total freq differs from sum of freqs.\n");
        return false;
    }

    return true;
}
/**
 * validate_bigram:
 * @bigram: the k mixture model to check.
 *
 * For every token returned by get_all_items(), loads its single gram and
 * checks the array header against the array items:
 *  - a header with zero word count must have no items;
 *  - a header must not have both word count and freq equal to zero;
 *  - the sum of m_WC over the items must equal the header's word count.
 *
 * A header with zero word count but non-zero freq is skipped after the
 * first check. Unlike a failed check_result(), a failed check here does
 * not stop the scan: a diagnostic goes to stderr and the remaining tokens
 * are still examined.
 *
 * Returns: true when no check failed for any token, false otherwise.
 */
bool validate_bigram(KMixtureModelBigram * bigram){
    bool result = true;

    GArray * items = g_array_new(FALSE, FALSE, sizeof(phrase_token_t));
    bigram->get_all_items(items);

    for (size_t i = 0; i < items->len; ++i) {
        phrase_token_t * token = &g_array_index(items, phrase_token_t, i);

        KMixtureModelSingleGram * single_gram = NULL;
        check_result(bigram->load(*token, single_gram));

        FlexibleBigramPhraseArray array = g_array_new
            (FALSE, FALSE, sizeof(KMixtureModelArrayItemWithToken));
        single_gram->retrieve_all(array);

        KMixtureModelArrayHeader array_header;
        check_result(single_gram->get_array_header(array_header));

        guint32 expected_sum = array_header.m_WC;
        guint32 freq = array_header.m_freq;

        if ( 0 == expected_sum ){
            if ( 0 != array->len ){
                fprintf(stderr, "in the array header of token %u:\n", *token);
                fprintf(stderr, "word count is zero but has array items.\n");
                result = false;
            }

            if ( 0 != freq ){
                /* Nothing more to verify for this token. Release both
                 * per-token resources before moving on; the item array
                 * was previously leaked on this path. */
                g_array_free(array, TRUE);
                delete single_gram;
                continue;
            } else {
                fprintf(stderr, "in the array header of token %u:\n", *token);
                fprintf(stderr, "both word count and freq are "
                        "unexpected zero.\n");
                result = false;
            }
        }

        /* Sum the word counts recorded in the individual items. */
        guint32 sum = 0;
        for (size_t m = 0; m < array->len; ++m){
            KMixtureModelArrayItemWithToken * item =
                &g_array_index(array, KMixtureModelArrayItemWithToken, m);
            sum += item->m_item.m_WC;
        }

        if ( sum != expected_sum ){
            fprintf(stderr, "word count in array header:%u\n", expected_sum);
            fprintf(stderr, "sum of word count in array items:%u\n", sum);
            fprintf(stderr, "the sum differs from word count.\n");
            result = false;
        }

        g_array_free(array, TRUE);
        delete single_gram;
    }

    g_array_free(items, TRUE);
    return result;
}
int main(int argc, char * argv[]){
GError * error = NULL;
GOptionContext * context;
context = g_option_context_new("- validate k mixture model");
if (!g_option_context_parse(context, &argc, &argv, &error)) {
g_print("option parsing failed:%s\n", error->message);
exit(EINVAL);
}
if (2 != argc) {
fprintf(stderr, "wrong arguments.\n");
exit(EINVAL);
}
const char * k_mixture_model_filename = argv[1];
KMixtureModelBigram bigram(K_MIXTURE_MODEL_MAGIC_NUMBER);
bigram.attach(k_mixture_model_filename, ATTACH_READONLY);
if (!validate_unigram(&bigram)) {
fprintf(stderr, "k mixture model validation failed.\n");
exit(ENODATA);
}
if (!validate_bigram(&bigram)) {
fprintf(stderr, "k mixture model validation failed.\n");
exit(ENODATA);
}
return 0;
}
|