File: CAcSQLInvertedFile.h

package info (click to toggle)
gnuift 0.1.14%2Bds-1
  • links: PTS
  • area: main
  • in suites: stretch
  • size: 5,632 kB
  • ctags: 2,973
  • sloc: cpp: 15,867; sh: 8,281; ansic: 1,812; perl: 1,007; php: 651; makefile: 483; lisp: 344
file content (247 lines) | stat: -rw-r--r-- 7,301 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
/* -*- mode: c++ -*- 
*/
/* 

    GIFT, a flexible content based image retrieval system.
    Copyright (C) 1998, 1999, 2000, 2001, 2002, CUI University of Geneva

     Copyright (C) 2003, 2004 Bayreuth University
      2005 Bamberg University
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

*/
// -*- mode: c++ -*-


// Forward declaration: within this header CXMLElement is only used as a
// const reference parameter of the CAcInvertedFile constructor, so the
// declaration of the name is sufficient here.
class CXMLElement;

/**
*
* 
*  This class manages the access to the inverted file as well 
*    as its generation
*
*
*
* modification history:
*
* WM 150600 created the file
*
*
*
* compiler defines used:
*
*
*/

#ifndef _CINVERTEDFILEACCESSOR
#define _CINVERTEDFILEACCESSOR
#include "libGIFTAcInvertedFile/include/uses-declarations.h"
#include <string>
#include "libMRML/include/TID.h"
#include "libMRML/include/CSelfDestroyPointer.h"
#include "libMRML/include/CArraySelfDestroyPointer.h"
#include "libGIFTAcInvertedFile/include/CDocumentFrequencyList.h"
#include "CCollectionFrequencyList.h"
#include "libGIFTAcInvertedFile/include/CADIHash.h"
#include "libGIFTAcURL2FTS/include/CAcURL2FTS.h"
#include <iostream>
#include <fstream>
#include <map>
#include <vector>
#ifdef HAS_HASH_MAP
#include <hash_map>
#else
#define hash_map map
#endif
#include <functional>
#include <algorithm>

#include "libMRML/include/CMagic.h"


/** Identifier of a feature; an alias of TID, so feature IDs and
    other IDs share the same underlying type. */
typedef TID TFeatureID ;

/**
   An accessor to an inverted file, derived from CAcURL2FTS.

   The access is currently done "by hand" through streams, which is
   not really efficient; the authors planned to move to memory
   mapped files.

   Besides lookups (feature -> documents, document -> features,
   per-feature and per-document statistics) the class also declares
   the functions that generate an inverted file and check its
   consistency.

   NOTE(review): several members return raw pointers
   (CDocumentFrequencyList*, list<TID>*). Ownership is not visible in
   this header; presumably the caller has to delete the result --
   confirm against the implementation.

   NOTE(review): list<TID> is used below, but <list> is not among the
   headers included directly by this file; presumably it arrives
   through one of the project headers -- confirm.
 */
class CAcInvertedFile:public CAcURL2FTS{  

protected:
  /** the maximum feature ID arising in this file */
  TID mMaximumFeatureID;
  /** A buffer, used if the inverted file is to be
      held in RAM */
  CArraySelfDestroyPointer<char> mInvertedFileBuffer;
  /** The inverted file. Declared mutable, so it can be modified
      from const member functions (presumably because reading
      changes the stream state -- confirm). */
  mutable CSelfDestroyPointer<istream> mInvertedFile;

  /** Feature -> Offset in inverted file. Declared mutable, so it
      can be modified from const member functions. */
  mutable ifstream mOffsetFile;

  /** File of feature descriptions */
  ifstream mFeatureDescriptionFile;

  /** Name of the inverted file */
  string mInvertedFileName;

  /** Name of the offset file */
  string mOffsetFileName;

  /** Name of the file with the feature descriptions */
  string mFeatureDescriptionFileName;

  /** Type of the map from feature ID to the offset for this
      feature. Note that hash_map is defined as a macro for map
      when HAS_HASH_MAP is not defined. */
  typedef hash_map<TID,unsigned int> CIDToOffset;//new hash
  /** map from feature ID to the offset for this feature */
  CIDToOffset mIDToOffset;

  /** map from feature to the collection frequency. Declared
      mutable (presumably filled on demand by the const accessor
      FeatureToCollectionFrequency() -- confirm). */
  mutable hash_map<TID,double> mFeatureToCollectionFrequency;//new hash

  /**@name for fast access...*/
  //@{
  /**  map from the feature ID to the feature description */
  hash_map<TID,unsigned int> mFeatureDescription;//new hash_

  /**  additional information about the document, e.g.
       the Euclidean length of the feature list.
   */
  CADIHash mDocumentInformation;
  //@}
  /** Add a pair of (feature ID, offset) to the open offset file
      (helper function for inverted file construction).

      @param inFeatureID      the feature the offset belongs to
      @param inPosition       the offset to be stored for the feature
      @param inOpenOffsetFile the already opened offset file stream
   */
  void writeOffsetFileElement(TID inFeatureID,
			      int inPosition,
			      ostream& inOpenOffsetFile);
  /** Loads a *.fts file and returns the feature list.

      @param inFileName name of the *.fts file to load
      @return the feature list read from the file
  */
  CDocumentFrequencyList* getFeatureFile(string inFileName)const;
public:
  /** for testing if the inverted file is correctly constructed */
  bool operator()()const;

  /**  This opens an existing inverted file, and then
       inits this structure. After that it is fully
       usable.

       As a parameter it takes an XMLElement which contains
       a "collection" element and its content.

       If the attribute vi-generate-inverted-file is true,
       then a new inverted file will be generated using
       the parameters given in inCollectionElement. You will
       NOT be able to use *this afterwards.

       The REAL constructor.

       NOTE(review): the constructor is not declared explicit, so a
       CXMLElement converts implicitly to a CAcInvertedFile.
  */
  CAcInvertedFile(const CXMLElement& inCollectionElement);
  /**  Called by constructors.

       NOTE(review): the meaning of the unnamed bool parameter and
       of the return value is not visible in this header.
  */
  bool init(bool);

  /** Destructor */
  ~CAcInvertedFile();
  
  /** Translate a document ID to a URL (for output) */
  string IDToURL(TID inID)const;

  /** Translate a URL to its document ID */
  TID URLToID(const string& inURL)const;
  
  /**@name The proper inverted file access*/
  //@{
  /** List of documents containing the feature */
  CDocumentFrequencyList* FeatureToList(TFeatureID)const;

  /** List of features contained by the document with URL inURL */
  CDocumentFrequencyList* URLToFeatureList(string inURL)const;

  /** List of features contained by a document with ID inDID */
  CDocumentFrequencyList* DIDToFeatureList(TID inDID)const;

  //@}


  /**@name Accessing information about features*/
  //@{
  /** Collection frequency for a given feature */
  double FeatureToCollectionFrequency(TFeatureID)const;

  /** What kind of feature is the feature with ID inFeatureID? */
  unsigned int getFeatureDescription(TID inFeatureID)const;
  //@}

  /**@name Accessing additional document information*/
  //@{
  /**  Returns the maximum document frequency for one document ID */
  double DIDToMaxDocumentFrequency(TID)const;

  /**  Returns the document-frequency square sum for a given document ID */
  double DIDToDFSquareSum(TID)const;

  /**  Returns the value named by the function for a given
       document ID (presumably the sum over the squared products
       of document frequency and log inverse collection frequency
       -- TODO confirm against the implementation). */
  double DIDToSquareDFLogICFSum(TID)const;
  //@}

  /**@name Inverted File Generation and Consistency Checking*/
  //@{

  /** Generating an inverted file, if there is none.
      Fast but stupid in-memory method. This method is
      very fast if all of the inverted file (and a bit more)
      can be kept in memory at runtime. If this is not the
      case, extensive swapping is the result, virtually halting
      the inverted file creation.

      NOTE(review): the meaning of the bool return value is not
      visible in this header; presumably it signals success.
   */
  bool generateInvertedFile();

  /** Generating an inverted file, if there is none.

      Employing the two-way-merge method described
      in "Managing Gigabytes", chapter 5.2, Sort-based
      inversion (page 181).

      NOTE(review): the meaning of the bool return value is not
      visible in this header; presumably it signals success.
   */
  bool newGenerateInvertedFile();

  /** Check the consistency of the inverted file system accessed
      by this accessor. */
  bool checkConsistency();

  /** Is the document with inDocumentID contained in the
      document frequency list of the feature inFeatureID, and
      is the associated document frequency the same as
      inDocumentFrequency? */
  bool findWithinStream(TID inFeatureID,
			TID inDocumentID,
			double inDocumentFrequency)const;
  
  //@}

  /** Returns the maximum feature ID. This is interesting for browsing. */
  TID getMaximumFeatureID()const;
  /** Getting a list of all features contained in this.
      This function is necessary because in the present
      system only about 50 percent of the features are
      really used.

      A feature is considered used if it arises in mIDToOffset.
   */
  list<TID>* getAllFeatureIDs()const;
};

#endif