1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154
|
#include <bow/libbow.h>
bow_wi2dvf *
bow_wicoo_from_barrel (bow_barrel *barrel)
{
bow_wi2dvf *wicoo;
bow_dv_heap *heap;
float num_words_in_wv;
int wvi1, wvi2;
bow_wv *wv;
int di;
bow_dv *dv;
wicoo = bow_wi2dvf_new (0);
/* Add statistics for all word co-occurrences. */
/* And prepare to set IDF to Pr(w) */
heap = bow_test_new_heap (barrel);
wv = NULL;
bow_verbosify (bow_progress,
"Calculating word co-occurrences ");
while ((di = bow_nontest_next_wv (heap, barrel, &wv))
!= -1)
{
if (di % 10 == 0)
bow_verbosify (bow_progress, "\b\b\b\b\b\b\b%7d", di);
/* Calculate the total number of words in WV */
num_words_in_wv = 0;
for (wvi1 = 0; wvi1 < wv->num_entries; wvi1++)
{
#if 0
/* Only count those words that are part of the vocabulary. */
if (bow_str2int_no_add (rainbowh_arg_state.vocab_map,
bow_int2word (wv->entry[wvi1].wi))
!= -1)
#endif
num_words_in_wv += wv->entry[wvi1].count;
}
for (wvi1 = 0; wvi1 < wv->num_entries; wvi1++)
{
for (wvi2 = 0; wvi2 < wv->num_entries; wvi2++)
{
/* Set COUNT to co-occurrence count.
Set WEIGHT to probabilistic sampling of document,
then word. */
bow_wi2dvf_add_wi_di_count_weight
(&wicoo, wv->entry[wvi1].wi, wv->entry[wvi2].wi,
wv->entry[wvi2].count,
wv->entry[wvi2].count / num_words_in_wv);
}
dv = bow_wi2dvf_dv (wicoo, wv->entry[wvi1].wi);
/* This relies on IDF being initialized to zero in bow_dv_new() */
dv->idf += wv->entry[wvi1].count / num_words_in_wv;
}
}
/* Normalize the IDF's so they are equal to Pr(w) in the corpus. */
{
int wi;
double idf_total = 0;
for (wi = 0; wi < wicoo->size; wi++)
{
dv = bow_wi2dvf_dv (wicoo, wi);
if (dv)
idf_total += dv->idf;
}
for (wi = 0; wi < wicoo->size; wi++)
{
dv = bow_wi2dvf_dv (wicoo, wi);
if (dv)
dv->idf /= idf_total;
}
}
bow_verbosify (bow_progress, "\n");
return wicoo;
}
void
bow_wicoo_pr_w_w (bow_wi2dvf *wicoo, int wi1, int wi2)
{
}
void
bow_wicoo_print_word_entropy (bow_wi2dvf *wicoo, int wi)
{
bow_dv *coov;
float total_num_coo_words;
float pr_w_w;
float total_pr_w_w;
int coovi;
float entropy;
int wi2, max_wi;
int m_est_m;
float m_est_p;
bow_dv *dv2;
coov = bow_wi2dvf_dv (wicoo, wi);
if (!coov)
return;
total_num_coo_words = 0;
for (coovi = 0; coovi < coov->length; coovi++)
total_num_coo_words += coov->entry[coovi].weight;
entropy = 0;
max_wi = bow_num_words ();
m_est_m = wicoo->num_words / 100;
total_pr_w_w = 0;
for (wi2 = 0, coovi = 0; wi2 < max_wi; wi2++)
{
dv2 = bow_wi2dvf_dv (wicoo, wi2);
if (!dv2)
continue;
m_est_p = dv2->idf;
while (coov->entry[coovi].di < wi2 && coovi < coov->length)
coovi++;
if (coov->entry[coovi].di == wi2)
{
/* Found word WI2 in vector. */
pr_w_w = (((float)coov->entry[coovi].weight + m_est_m * m_est_p)
/ (total_num_coo_words + m_est_m));
}
else
{
/* Word WI2 does not co-occur with WI. */
pr_w_w = ((m_est_m * m_est_p)
/ (total_num_coo_words + m_est_m));
}
#if 1
printf ("%-30s %12.7f %s\n",
bow_int2word (wi), pr_w_w, bow_int2word (wi2));
#endif
/* pr_w_w = (float)coov->entry[coovi].weight / total_num_coo_words; */
total_pr_w_w += pr_w_w;
entropy -= pr_w_w * log (pr_w_w);
}
assert (total_pr_w_w > 0.99 && total_pr_w_w < 1.01);
printf ("%-15.7f %s\n", entropy, bow_int2word (wi));
}
/* Shrink the weights of WV toward documents in BARREL, according to
their distance to WV. */
void
bow_barrel_shrink_wv (bow_barrel *barrel, bow_wv *wv)
{
return;
}
|