File: heap.c

package info (click to toggle)
bow 20020213-8
  • links: PTS
  • area: main
  • in suites: sarge
  • size: 2,596 kB
  • ctags: 2,871
  • sloc: ansic: 36,321; lisp: 1,072; cpp: 969; makefile: 569; perl: 495; sh: 101
file content (213 lines) | stat: -rw-r--r-- 6,335 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
/* Implementation of the ?? heap for the bow library */

/* Copyright (C) 1997, 1998, 1999 Andrew McCallum

   Written by:  Sean Slattery <slttery@cs.cmu.edu>

   This file is part of the Bag-Of-Words Library, `libbow'.

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License
   as published by the Free Software Foundation, version 2.
   
   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA */


#include <bow/libbow.h>

/* Some HEAP thingys */
#define PARENT(i) ((i) / 2)
#define LEFT(i) (2 * (i))
#define RIGHT(i) (2 * (i) + 1)

/* We index from 1->length - do the translation back to 0-(length-1) here */
#define HEAP_ELT(i) (heap->entry[i-1])
#define HEAP_KEY(i) (HEAP_ELT(i).current_di)
#define HEAP_WI(i)  (HEAP_ELT(i).wi) 

/* Return true if a belongs higher up the heap than b. For a Max Heap the
   compare should be >=, and for a Min Heap it should be <= */
#define HEAP_CMP(a,b) ((a) < (b))

/* Comparing the wi's mean we get the words out in order from lowest to 
   highest. */
#define HEAP_HIGHER(a,b) (HEAP_CMP(HEAP_KEY(a), HEAP_KEY(b)) || ((HEAP_KEY(a) == HEAP_KEY(b)) && (HEAP_WI(a) < HEAP_WI(b))))

/* Function to turn an array of bow_dv_heap_elements into a proper heap */
void
bow_heapify(bow_dv_heap *heap, int i)
{
  int l, r, highest;
  bow_dv_heap_element tmp;

  l = LEFT(i);
  r = RIGHT(i);

  if ((l <= heap->length) && 
      HEAP_HIGHER(l, i))
    highest = l;
  else
    highest = i;

  if ((r <= heap->length) && 
      HEAP_HIGHER(r, highest))
    highest = r;

  if (highest != i) {
    tmp = HEAP_ELT(i);
    HEAP_ELT(i) = HEAP_ELT(highest);
    HEAP_ELT(highest) = tmp;

    /* Need to recurse to fix the subtree */
    bow_heapify (heap, highest);
  }
}

/* Function to create a heap of the vectors of documents associated
   with each word in the word vector. */
bow_dv_heap *
bow_make_dv_heap_from_wv (bow_wi2dvf *wi2dvf, bow_wv *wv)
{
  int wv_index;			/* an index into the "word vector" */
  int heap_index;		/* an index into the heap we are creating */
  int wi, i;
  bow_dv_heap *heap;
  bow_dv *dv;

  heap = bow_malloc (sizeof (bow_dv_heap) 
		     + (sizeof (bow_dv_heap_element) 
			* (wv->num_entries)));

  /* Dump all the elements into the heap */
  heap_index = 0;
  for (wv_index = 0; wv_index < wv->num_entries; wv_index++)
    {
      /* Get the word index */
      wi = wv->entry[wv_index].wi;  

      /* Now fetch the list of documents associated with this word */
      /* Use this function instead of accessing the wi2dvf->entry
	 directly because the dv may need to be read from a file. */
      dv = bow_wi2dvf_dv (wi2dvf, wi);
      if (!dv)
	continue;
      heap->entry[heap_index].dv = dv;
      assert (dv->idf == dv->idf); /* Check for NaN */
      heap->entry[heap_index].wi = wi;
      heap->entry[heap_index].index = 0;
      heap->entry[heap_index].current_di = dv->entry[0].di;
      heap_index++;
    }

  /* Initialise the heap */
  heap->length = heap_index;

  /* Now need to make this baby into a heap. We'll use i to index into
     a conceptual array with indices 1..length and convert those
     references to index an array of 0..(length-1) when required. */
  for (i = (heap->length)/2; i > 0; i--) 
    bow_heapify (heap, i);

  return heap;
}

/* Function to take the top element of the heap - move it's index along and
   place it back in the heap. */
void
bow_dv_heap_update (bow_dv_heap *heap)
{
  bow_dv_heap_element *top = &(heap->entry[0]);

  /* Increment the index */
  (top->index)++;

  /* Check to make sure we have elements left to look at */
  if (top->index < top->dv->length) 
    {
      top->current_di = top->dv->entry[top->index].di;

      /* Heapify!! */
      bow_heapify (heap, 1);

    }
  else
    {
      /* Here we draft in the end of the heap and Heapify */
      heap->entry[0] = heap->entry[(heap->length) - 1];

      (heap->length)--;

      bow_heapify (heap, 1);
    }
}

/* Function to make a heap from all the vectors of documents in the
   big data structure we've built.  If EVEN_IF_HIDDEN is non-zero,
   then words that have been "hidden" (by feature selection, for
   example) will none-the-less also be included in the WV's returned
   by future calls to the heap; think carefully before you do this! */
bow_dv_heap *
bow_make_dv_heap_from_wi2dvf_hidden (bow_wi2dvf *wi2dvf, int even_if_hidden)
{
  int wi;			/* a "word index", index into WI2DVF */
  int max_wi;			/* the highest "word index" */
  int hi;			/* a "heap index", an index into the heap */
  bow_dv_heap *heap;		/* what we are creating and returning */
  bow_dv *dv;

  max_wi = MIN (wi2dvf->size, bow_num_words ());
  heap = bow_malloc (sizeof (bow_dv_heap) 
		     + (sizeof (bow_dv_heap_element) * (max_wi)));

  /* Dump all the vectors of documents into the array */
  hi = 0;
  for (wi = 0; wi < max_wi; wi++)
    {
      dv = bow_wi2dvf_dv_hidden (wi2dvf, wi, even_if_hidden);
      if (dv)
	{
	  heap->entry[hi].dv = dv;
	  heap->entry[hi].wi = wi;
	  heap->entry[hi].index = 0;
	  heap->entry[hi].current_di = dv->entry[0].di;
	  /* xxx It would be nice to check for values too high, also.*/
	  assert (dv->entry[0].di >= 0);
	  hi++;
	}
    }

  /* Initialise the Heap */
  heap->length = hi;
  for (hi = (heap->length)/2; hi > 0; hi--) 
    bow_heapify (heap, hi);

  heap->heap_wv = NULL;
  /* This special -2 value used in split.c */
  heap->heap_wv_di = -2;
  heap->last_di = -2;

  return heap;
}

/* Function to make a heap from all the vectors of documents in the
   big data structure we've built.  */
bow_dv_heap *
bow_make_dv_heap_from_wi2dvf (bow_wi2dvf *wi2dvf)
{
  return bow_make_dv_heap_from_wi2dvf_hidden (wi2dvf, 0);
}

/* Free a heap.  Seldom needs to be called from outside this function
   since it is done automatically by the bow_*_next_wv() functions. */
void
bow_dv_heap_free (bow_dv_heap *heap)
{
  bow_free (heap);
}