/*!
* lunr.Builder
* Copyright (C) @YEAR Oliver Nightingale
*/
/**
* lunr.Builder performs indexing on a set of documents and
* returns instances of lunr.Index ready for querying.
*
* All configuration of the index is done via the builder, the
* fields to index, the document reference, the text processing
* pipeline and document scoring parameters are all set on the
* builder before indexing.
*
* @constructor
* @property {string} _ref - Internal reference to the document reference field.
* @property {object} _fields - Internal reference to the document fields to index, keyed by field name.
* @property {object} invertedIndex - The inverted index maps terms to document fields.
* @property {object} fieldTermFrequencies - Keeps track of term frequencies for each document field.
* @property {object} fieldLengths - Keeps track of the length of each document field added to the index.
* @property {lunr.tokenizer} tokenizer - Function for splitting strings into tokens for indexing.
* @property {lunr.Pipeline} pipeline - The pipeline performs text processing on tokens before indexing.
* @property {lunr.Pipeline} searchPipeline - A pipeline for processing search terms before querying the index.
* @property {number} documentCount - Keeps track of the total number of documents indexed.
* @property {number} _b - A parameter to control field length normalization, setting this to 0 disables normalization, 1 fully normalizes field lengths, the default value is 0.75.
* @property {number} _k1 - A parameter to control how quickly an increase in term frequency results in term frequency saturation, the default value is 1.2.
* @property {number} termIndex - A counter incremented for each unique term, used to identify a term's position in the vector space.
* @property {array} metadataWhitelist - A list of metadata keys that have been whitelisted for entry in the index.
*/
lunr.Builder = function () {
  // Document identity and the fields to index. These maps are created
  // without a prototype, so key lookups never see inherited
  // Object.prototype members.
  this._ref = "id"
  this._fields = Object.create(null)
  this._documents = Object.create(null)

  // State accumulated while documents are being added.
  this.invertedIndex = Object.create(null)
  this.fieldTermFrequencies = {}
  this.fieldLengths = {}

  // Text processing: the tokenizer, the pipeline applied while indexing
  // and the separate pipeline handed to the built index for searching.
  this.tokenizer = lunr.tokenizer
  this.pipeline = new lunr.Pipeline()
  this.searchPipeline = new lunr.Pipeline()

  // Counters and scoring parameters.
  this.documentCount = 0
  this._b = 0.75
  this._k1 = 1.2
  this.termIndex = 0
  this.metadataWhitelist = []
}
/**
* Sets the document field used as the document reference. Every document must have this field.
* The type of this field in the document should be a string, if it is not a string it will be
* coerced into a string by calling toString.
*
* The default ref is 'id'.
*
* The ref should _not_ be changed during indexing, it should be set before any documents are
* added to the index. Changing it during indexing can lead to inconsistent results.
*
* @param {string} ref - The name of the reference field in the document.
*/
lunr.Builder.prototype.ref = function (ref) {
  // Stored as given; add() reads doc[this._ref] to obtain the reference
  // of every document it indexes.
  this._ref = ref
}
/**
* A function that is used to extract a field from a document.
*
* Lunr expects a field to be at the top level of a document, if however the field
* is deeply nested within a document an extractor function can be used to extract
* the right field for indexing.
*
* @callback fieldExtractor
* @param {object} doc - The document being added to the index.
* @returns {?(string|object|object[])} obj - The object that will be indexed for this field.
* @example <caption>Extracting a nested field</caption>
* function (doc) { return doc.nested.field }
*/
/**
* Adds a field to the list of document fields that will be indexed. Every document being
* indexed should have this field. Null values for this field in indexed documents will
* not cause errors but will limit the chance of that document being retrieved by searches.
*
* All fields should be added before adding documents to the index. Adding fields after
* a document has been indexed will have no effect on already indexed documents.
*
* Fields can be boosted at build time. This allows terms within that field to have more
* importance when ranking search results. Use a field boost to specify that matches within
* one field are more important than other fields.
*
* @param {string} fieldName - The name of a field to index in all documents.
* @param {object} attributes - Optional attributes associated with this field.
* @param {number} [attributes.boost=1] - Boost applied to all terms within this field.
* @param {fieldExtractor} [attributes.extractor] - Function to extract a field from a document.
* @throws {RangeError} fieldName cannot contain unsupported characters '/'
*/
lunr.Builder.prototype.field = function (fieldName, attributes) {
  // Field names are later combined with a document ref into a lunr.FieldRef.
  // NOTE(review): presumably '/' is the separator FieldRef uses when it is
  // turned into a string - confirm against lunr.FieldRef.
  var containsSlash = /\//.test(fieldName)

  if (containsSlash) {
    throw new RangeError ("Field '" + fieldName + "' contains illegal character '/'")
  }

  // Always store an object so that .boost and .extractor can be read
  // later without checking for a missing attributes argument.
  this._fields[fieldName] = attributes ? attributes : {}
}
/**
* A parameter to tune the amount of field length normalisation that is applied when
* calculating relevance scores. A value of 0 will completely disable any normalisation
* and a value of 1 will fully normalise field lengths. The default is 0.75. Values of b
* will be clamped to the range 0 - 1.
*
* @param {number} number - The value to set for this tuning parameter.
*/
lunr.Builder.prototype.b = function (number) {
  // Values inside the range are stored untouched; only values strictly
  // below 0 or strictly above 1 are replaced by the nearest bound.
  var clamped = number

  if (number < 0) {
    clamped = 0
  } else if (number > 1) {
    clamped = 1
  }

  this._b = clamped
}
/**
* A parameter that controls the speed at which a rise in term frequency results in term
* frequency saturation. The default value is 1.2. Setting this to a higher value will give
* slower saturation levels, a lower value will result in quicker saturation.
*
* @param {number} number - The value to set for this tuning parameter.
*/
lunr.Builder.prototype.k1 = function (number) {
  // No validation or clamping is applied here; the value is used as-is
  // by the scoring formula when the field vectors are created.
  this._k1 = number
}
/**
* Adds a document to the index.
*
* Before adding documents to the index, the index should have been fully set up, with the document
* ref and all fields to index already having been specified.
*
* The document must have a field name as specified by the ref (by default this is 'id') and
* it should have all fields defined for indexing, though null or undefined values will not
* cause errors.
*
* Entire documents can be boosted at build time. Applying a boost to a document indicates that
* this document should rank higher in search results than other documents.
*
* @param {object} doc - The document to add to the index.
* @param {object} attributes - Optional attributes associated with this document.
* @param {number} [attributes.boost=1] - Boost applied to all terms within this document.
*/
lunr.Builder.prototype.add = function (doc, attributes) {
  var docRef = doc[this._ref],
      fieldNames = Object.keys(this._fields)

  // keep the per-document attributes (e.g. boost) for use when scoring
  this._documents[docRef] = attributes || {}
  this.documentCount += 1

  for (var f = 0; f < fieldNames.length; f++) {
    var fieldName = fieldNames[f],
        extractor = this._fields[fieldName].extractor,
        fieldValue = extractor ? extractor(doc) : doc[fieldName],
        tokens = this.tokenizer(fieldValue, { fields: [fieldName] }),
        terms = this.pipeline.run(tokens),
        fieldRef = new lunr.FieldRef (docRef, fieldName),
        termCounts = Object.create(null)

    this.fieldTermFrequencies[fieldRef] = termCounts

    // the length of a field is the number of terms that came out of the pipeline
    this.fieldLengths[fieldRef] = terms.length

    for (var t = 0; t < terms.length; t++) {
      var term = terms[t]

      // count how often this term occurs within the field
      if (termCounts[term] == undefined) {
        termCounts[term] = 1
      } else {
        termCounts[term] += 1
      }

      // the first time a term is seen anywhere it is given the next free
      // vector position and an empty posting for every known field
      var posting = this.invertedIndex[term]

      if (posting == undefined) {
        posting = Object.create(null)
        posting["_index"] = this.termIndex
        this.termIndex += 1

        for (var k = 0; k < fieldNames.length; k++) {
          posting[fieldNames[k]] = Object.create(null)
        }

        this.invertedIndex[term] = posting
      }

      // make sure there is an entry for this term/fieldName/docRef
      var fieldPosting = posting[fieldName]

      if (fieldPosting[docRef] == undefined) {
        fieldPosting[docRef] = Object.create(null)
      }

      var docMetadata = fieldPosting[docRef]

      // copy every whitelisted piece of token metadata into the entry,
      // one array element per occurrence of the term
      for (var m = 0; m < this.metadataWhitelist.length; m++) {
        var metadataKey = this.metadataWhitelist[m],
            metadataValue = term.metadata[metadataKey]

        if (docMetadata[metadataKey] == undefined) {
          docMetadata[metadataKey] = []
        }

        docMetadata[metadataKey].push(metadataValue)
      }
    }
  }
}
/**
* Calculates the average length of each field over the documents added to this index
*
* @private
*/
lunr.Builder.prototype.calculateAverageFieldLengths = function () {
  var fieldRefs = Object.keys(this.fieldLengths),
      // Both tallies are keyed by field name and are created without a
      // prototype: with a plain {} a field called "constructor" or
      // "toString" would find the inherited Object.prototype member
      // instead of a missing entry, skip initialisation and corrupt the sum.
      totalLengths = Object.create(null),
      documentsWithField = Object.create(null)

  // sum the lengths and count the entries for every field name
  for (var i = 0; i < fieldRefs.length; i++) {
    var fieldRef = lunr.FieldRef.fromString(fieldRefs[i]),
        field = fieldRef.fieldName

    if (documentsWithField[field] === undefined) {
      documentsWithField[field] = 0
      totalLengths[field] = 0
    }

    documentsWithField[field] += 1
    totalLengths[field] += this.fieldLengths[fieldRef]
  }

  // turn each total into an average; a configured field that has no
  // entries at all ends up as NaN (undefined / undefined)
  var fields = Object.keys(this._fields)

  for (var j = 0; j < fields.length; j++) {
    var fieldName = fields[j]
    totalLengths[fieldName] = totalLengths[fieldName] / documentsWithField[fieldName]
  }

  this.averageFieldLength = totalLengths
}
/**
* Builds a vector space model of every document using lunr.Vector
*
* @private
*/
lunr.Builder.prototype.createFieldVectors = function () {
  var vectors = {},
      refStrings = Object.keys(this.fieldTermFrequencies),
      idfByTerm = Object.create(null)

  for (var i = 0; i < refStrings.length; i++) {
    var fieldRef = lunr.FieldRef.fromString(refStrings[i]),
        fieldName = fieldRef.fieldName,
        fieldLength = this.fieldLengths[fieldRef],
        vector = new lunr.Vector(),
        frequencies = this.fieldTermFrequencies[fieldRef],
        terms = Object.keys(frequencies)

    // a missing (or falsy) boost counts as 1
    var fieldBoost = this._fields[fieldName].boost || 1,
        docBoost = this._documents[fieldRef.docRef].boost || 1

    for (var j = 0; j < terms.length; j++) {
      var term = terms[j],
          tf = frequencies[term],
          termIndex = this.invertedIndex[term]._index,
          idf = idfByTerm[term]

      // the idf of a term is the same for every field, so it is only
      // computed the first time the term is encountered
      if (idf === undefined) {
        idf = lunr.idf(this.invertedIndex[term], this.documentCount)
        idfByTerm[term] = idf
      }

      // BM25-style weighting: _k1 controls term frequency saturation and
      // _b how strongly the field length, relative to the average length
      // of this field, normalises the score
      var lengthRatio = fieldLength / this.averageFieldLength[fieldName],
          saturation = (this._k1 + 1) * tf,
          normalisation = this._k1 * (1 - this._b + this._b * lengthRatio) + tf,
          score = idf * saturation / normalisation,
          boostedScore = score * fieldBoost * docBoost

      // Rounds to three decimal places, e.g. 1.23456789 becomes 1.235.
      // The precision is reduced so that the vectors take up less space
      // when serialised, and it is done now so that they behave the same
      // before and after serialisation.
      var scoreWithPrecision = Math.round(boostedScore * 1000) / 1000

      vector.insert(termIndex, scoreWithPrecision)
    }

    vectors[fieldRef] = vector
  }

  this.fieldVectors = vectors
}
/**
* Creates a token set of all tokens in the index using lunr.TokenSet
*
* @private
*/
lunr.Builder.prototype.createTokenSet = function () {
  // Every key of the inverted index is an indexed term; the terms are
  // sorted before being handed to lunr.TokenSet.fromArray.
  var sortedTerms = Object.keys(this.invertedIndex)
  sortedTerms.sort()

  this.tokenSet = lunr.TokenSet.fromArray(sortedTerms)
}
/**
* Builds the index, creating an instance of lunr.Index.
*
* This completes the indexing process and should only be called
* once all documents have been added to the index.
*
* @returns {lunr.Index}
*/
lunr.Builder.prototype.build = function () {
  // Order matters: the field vectors are scored against the average
  // field lengths, so the averages have to be calculated first.
  this.calculateAverageFieldLengths()
  this.createFieldVectors()
  this.createTokenSet()

  var indexAttributes = {
    invertedIndex: this.invertedIndex,
    fieldVectors: this.fieldVectors,
    tokenSet: this.tokenSet,
    fields: Object.keys(this._fields),
    // the index receives the search pipeline, not the indexing pipeline
    pipeline: this.searchPipeline
  }

  return new lunr.Index(indexAttributes)
}
/**
* Applies a plugin to the index builder.
*
* A plugin is a function that is called with the index builder as its context.
* Plugins can be used to customise or extend the behaviour of the index
* in some way. A plugin is just a function that encapsulates the custom
* behaviour that should be applied when building the index.
*
* The plugin function will be called with the index builder as its argument, additional
* arguments can also be passed when calling use. The function will be called
* with the index builder as its context.
*
* @param {Function} fn - The plugin to apply.
*/
lunr.Builder.prototype.use = function (fn) {
  // The builder is both the `this` value and the first argument of the
  // plugin; anything passed to use() after the plugin itself follows it.
  var pluginArgs = [this]

  for (var i = 1; i < arguments.length; i++) {
    pluginArgs.push(arguments[i])
  }

  fn.apply(this, pluginArgs)
}
|