File: wicoo.c

package info (click to toggle)
bow 19991122-4
  • links: PTS
  • area: main
  • in suites: woody
  • size: 2,544 kB
  • ctags: 2,987
  • sloc: ansic: 38,660; lisp: 1,072; makefile: 594; perl: 492; yacc: 149; sh: 91
file content (154 lines) | stat: -rw-r--r-- 3,692 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#include <bow/libbow.h>

/* Build a word-to-word co-occurrence index from BARREL.

   Every word vector handed back by bow_nontest_next_wv() is visited.
   For each ordered pair of entries (W1, W2) of such a vector --
   including a word paired with itself -- an item is added under W1,
   with W2's word index passed in the DI argument position of
   bow_wi2dvf_add_wi_di_count_weight(), and with
     COUNT  = W2's count in the document,
     WEIGHT = W2's count divided by the document's total word count.

   The IDF field of each word's vector is reused as an accumulator of
   that word's per-document relative frequency and, once every
   document has been seen, is rescaled so that the IDF values of all
   words sum to one (an estimate of Pr(w) in the corpus).

   Progress is reported through bow_verbosify().  Returns the newly
   created index. */
bow_wi2dvf *
bow_wicoo_from_barrel  (bow_barrel *barrel)
{
  bow_wi2dvf *wicoo = bow_wi2dvf_new (0);
  bow_dv_heap *heap = bow_test_new_heap (barrel);
  bow_wv *wv = NULL;
  bow_dv *dv;
  double idf_sum = 0;
  int di;
  int wi;

  bow_verbosify (bow_progress,
		 "Calculating word co-occurrences          ");

  for (;;)
    {
      float doc_total = 0;
      int row, col;

      di = bow_nontest_next_wv (heap, barrel, &wv);
      if (di == -1)
        break;

      if (di % 10 == 0)
        bow_verbosify (bow_progress, "\b\b\b\b\b\b\b%7d", di);

      /* Total number of word occurrences in this document.  Every
         entry is counted; no vocabulary filtering is applied. */
      for (row = 0; row < wv->num_entries; row++)
        doc_total += wv->entry[row].count;

      for (row = 0; row < wv->num_entries; row++)
        {
          /* Record every word of the document as co-occurring with
             the word at ROW. */
          for (col = 0; col < wv->num_entries; col++)
            bow_wi2dvf_add_wi_di_count_weight
              (&wicoo, wv->entry[row].wi, wv->entry[col].wi,
               wv->entry[col].count,
               wv->entry[col].count / doc_total);

          /* Accumulate this document's share of the word into IDF.
             This relies on IDF being initialized to zero in
             bow_dv_new(). */
          dv = bow_wi2dvf_dv (wicoo, wv->entry[row].wi);
          dv->idf += wv->entry[row].count / doc_total;
        }
    }

  /* First pass: sum the accumulated IDF values over all words that
     have a vector. */
  for (wi = 0; wi < wicoo->size; wi++)
    {
      dv = bow_wi2dvf_dv (wicoo, wi);
      if (dv != NULL)
        idf_sum += dv->idf;
    }

  /* Second pass: divide through so the IDF values sum to one. */
  for (wi = 0; wi < wicoo->size; wi++)
    {
      dv = bow_wi2dvf_dv (wicoo, wi);
      if (dv != NULL)
        dv->idf /= idf_sum;
    }

  bow_verbosify (bow_progress, "\n");

  return wicoo;
}

/* Unimplemented placeholder: the body does nothing and returns no
   value.  NOTE(review): judging by the name alone, this was presumably
   meant to compute a co-occurrence probability for the word pair
   (WI1, WI2) from WICOO -- confirm before relying on it. */
void
bow_wicoo_pr_w_w (bow_wi2dvf *wicoo, int wi1, int wi2)
{
  /* Arguments are intentionally unused. */
  (void) wicoo;
  (void) wi1;
  (void) wi2;
}

/* Print to stdout the smoothed co-occurrence distribution of word WI
   and the entropy of that distribution.

   WICOO is a co-occurrence index whose per-word vectors carry the
   co-occurring word's index in the DI field and a WEIGHT, and whose
   IDF field holds a prior probability for the word.  For every word
   index WI2 below bow_num_words() that has a vector in WICOO, the
   m-estimate

       pr = (weight(WI, WI2) + m * idf(WI2)) / (total_weight(WI) + m)

   is computed, with m = WICOO->num_words / 100 (integer division) and
   weight taken as zero when WI2 does not appear in WI's vector.  One
   line is printed per WI2, followed by a final line with the entropy
   (natural log) and the word itself.

   Does nothing if WI has no vector in WICOO.  Asserts that the
   probabilities sum to one within 1%. */
void
bow_wicoo_print_word_entropy (bow_wi2dvf *wicoo, int wi)
{
  bow_dv *coov;
  float total_num_coo_words;
  float pr_w_w;
  float total_pr_w_w;
  int coovi;
  float entropy;
  int wi2, max_wi;
  int m_est_m;
  float m_est_p;
  bow_dv *dv2;

  coov = bow_wi2dvf_dv (wicoo, wi);
  if (!coov)
    return;

  /* Normalizer: total co-occurrence weight recorded for WI. */
  total_num_coo_words = 0;
  for (coovi = 0; coovi < coov->length; coovi++)
    total_num_coo_words += coov->entry[coovi].weight;

  entropy = 0;
  max_wi = bow_num_words ();
  m_est_m = wicoo->num_words / 100;
  total_pr_w_w = 0;

  /* Walk the word indices and WI's vector in step.  NOTE(review):
     this assumes the entries of COOV are ordered by increasing DI --
     confirm against the index implementation. */
  for (wi2 = 0, coovi = 0; wi2 < max_wi; wi2++)
    {
      dv2 = bow_wi2dvf_dv (wicoo, wi2);
      if (!dv2)
        continue;
      m_est_p = dv2->idf;

      /* Advance the cursor to the first entry not below WI2.  The
         bounds test must come first so that entry[length] is never
         read once the vector is exhausted. */
      while (coovi < coov->length && coov->entry[coovi].di < wi2)
        coovi++;

      if (coovi < coov->length && coov->entry[coovi].di == wi2)
        {
          /* Found word WI2 in vector. */
          pr_w_w = (((float)coov->entry[coovi].weight + m_est_m * m_est_p)
                    / (total_num_coo_words + m_est_m));
        }
      else
        {
          /* Word WI2 does not co-occur with WI. */
          pr_w_w = ((m_est_m * m_est_p)
                    / (total_num_coo_words + m_est_m));
        }
#if 1
      printf ("%-30s %12.7f %s\n",
              bow_int2word (wi), pr_w_w, bow_int2word (wi2));
#endif
      total_pr_w_w += pr_w_w;

      /* A zero-probability outcome contributes nothing to the entropy
         (0 * log 0 is taken as 0).  Evaluating it directly would give
         0 * -inf = NaN and poison the sum; this can happen when
         M_EST_M is zero. */
      if (pr_w_w > 0)
        entropy -= pr_w_w * log (pr_w_w);
    }
  assert (total_pr_w_w > 0.99 && total_pr_w_w < 1.01);
  printf ("%-15.7f %s\n", entropy, bow_int2word (wi));
}

/* Intended to shrink the weights of WV toward the documents in BARREL,
   according to their distance to WV.  Currently a no-op placeholder:
   neither argument is read or modified. */
void
bow_barrel_shrink_wv (bow_barrel *barrel, bow_wv *wv)
{
  /* Not implemented; arguments are intentionally unused. */
  (void) barrel;
  (void) wv;
}