1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213
|
#ifndef __EXPORT_H__
#define __EXPORT_H__
#include <string>
#include <vector>
/**
* \addtogroup swig_interface SimString SWIG interface
* @{
*
* The SimString SWIG interface.
*/
/**
* Similarity measures.
*
* The enumeration is unnamed, so its enumerators are plain integer constants
* numbered implicitly in declaration order, from 0 (exact) to 4 (overlap).
* The documentation of reader::measure, which is declared as an int, points
* to these constants.
*/
enum {
/// Exact matching.
exact,
/// Dice coefficient.
dice,
/// Cosine coefficient.
cosine,
/// Jaccard coefficient.
jaccard,
/// Overlap coefficient.
overlap,
};
/**
* SimString database writer.
*
* An instance is bound to one database file given at construction time;
* strings are added with insert() and the database is finalized with close()
* or by destroying the object.
*
* NOTE(review): the class stores raw \c void* handles and declares a
* destructor, but declares no copy constructor or copy assignment operator,
* so the implicitly generated ones would duplicate the handles. Confirm
* that writer objects are never copied.
*/
class writer
{
protected:
/// Opaque handle; presumably the underlying database writer object
/// (its concrete type is not visible in this header -- TODO confirm).
void *m_dbw;
/// Opaque handle; presumably the n-gram generator used by insert()
/// (its concrete type is not visible in this header -- TODO confirm).
void *m_gen;
/// Presumably records the \c unicode flag given to the constructor
/// (TODO confirm against the implementation).
bool m_unicode;
public:
/**
* Creates a new database.
* This function creates an instance of SimString database writer
* for creating a new database. If this function fails to open
* the database, it throws SWIG_IOError.
*
* @param filename The database filename.
* @param n The unit of character n-grams (the default is 3).
* @param be \c true to represent a begin and end of strings
* in character n-grams (the default is \c false).
* @param unicode \c true to use Unicode mode. In Unicode mode,
* wide (\c wchar_t) characters are used in n-grams
* (the default is \c false).
* @throw SWIG_IOError
*/
writer(const char *filename, int n = 3, bool be = false, bool unicode = false);
/**
* Destructs the writer.
* Destructing a writer object automatically closes the database.
* @throw SWIG_IOError
*/
virtual ~writer();
/**
* Inserts a string into the database.
* @param string A string to be inserted into the database. This
* argument must be a null-terminated byte stream.
* If the database is created with Unicode mode, this
* function assumes that the byte stream is encoded in
* UTF-8, and converts it into a \c wchar_t string.
* @throw SWIG_IOError
*/
void insert(const char *string);
/**
* Closes the database.
* This function flushes and closes the database. If this function fails
* to close the database, it throws SWIG_IOError.
* @throw SWIG_IOError
*/
void close();
};
/**
* SimString database reader.
*
* An instance is bound to one existing database file given at construction
* time. The public attributes \ref measure and \ref threshold configure
* the queries issued through retrieve().
*
* NOTE(review): the class stores a raw \c void* handle and declares a
* destructor, but declares no copy constructor or copy assignment operator,
* so the implicitly generated ones would duplicate the handle. Confirm
* that reader objects are never copied.
*/
class reader
{
protected:
/// Opaque handle; presumably the underlying database reader object
/// (its concrete type is not visible in this header -- TODO confirm).
void *m_dbr;
public:
/**
* Opens a database for retrieving strings.
* This function creates an instance of SimString database reader
* by opening an existing database. If this function fails to open
* the database, it throws SWIG_IOError.
*
* @param filename The database filename.
* @throw SWIG_IOError
*/
reader(const char *filename);
/**
* Destructs the database reader.
* Destructing the reader object automatically closes the database.
*/
virtual ~reader();
/**
* Retrieves strings that are similar to the query string.
* This function retrieves strings whose similarity with the query string
* is no smaller than a threshold. Before calling this function, set the
* similarity measure and threshold to \ref measure and \ref threshold
* attributes of the reader object.
*
* @param query The query string. This argument must be a
* null-terminated byte stream. If the database was
* created with Unicode mode, this function assumes
* that the byte stream is encoded in UTF-8, and
* converts it into a wchar_t string.
* @return The array of strings retrieved for the query.
* If the database was created with Unicode mode,
* this function returns strings in UTF-8.
* @see measure The similarity function used by this function.
* @see threshold The similarity value used by this function.
*/
std::vector<std::string> retrieve(const char *query);
/**
* Closes a database.
*/
void close();
public:
/**
* Similarity measure.
* Specify a similarity measure for approximate string retrieval used
* by retrieve() function. The value is one of the integer constants
* listed below; the initial value is not visible in this header.
* @see exact, cosine, dice, jaccard, overlap
*/
int measure;
/**
* Threshold for the similarity measure.
* Specify a threshold for approximate string retrieval used by
* retrieve() function. The initial value is not visible in this header.
*/
double threshold;
};
/** @} */
/**
@mainpage SimString SWIG interface
@section intro Introduction
This document describes a SWIG interface that bridges SimString with various
programming languages including Python and Ruby. Although the current SimString
distribution provides SWIG wrappers for Python and Ruby, it may be easy to
build libraries for other languages.
The SimString module provides two simple classes, ::writer and ::reader.
In the ::writer class, one can create a SimString database using the
constructor writer::writer, and call the member function writer::insert for
inserting a string into the database.
In the ::reader class, one can open an existing SimString database with the
constructor reader::reader, specify a similarity measure and threshold with
two attributes reader::measure and reader::threshold, and call the member
function reader::retrieve for performing approximate string matching.
The SimString module always uses 8-bit null-terminated byte streams in
writer::insert and reader::retrieve functions. The encoding of byte streams
can be arbitrary, but must be UTF-8 for a database in Unicode mode.
@section api Documentation
- @ref swig_interface "SWIG interface"
@section language Language-specific Notes
@subsection language-ruby Ruby
- Because of the naming convention, the initial letter of a name is
capitalized as follows:
the module name (Simstring), class names (Writer and Reader),
and similarity measures (Exact, Dice, Cosine, Jaccard, Overlap).
@section sample Sample Programs
@subsection python Python
A basic sample.
@include python/sample.py
A Unicode sample.
@include python/sample_unicode.py
@subsection ruby Ruby
A basic sample.
@include ruby/sample.rb
A Unicode sample.
@include ruby/sample_unicode.rb
*/
#endif/*__EXPORT_H__*/
|