1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <memory>
#include "db/range_tombstone_fragmenter.h"
#if USE_COROUTINES
#include "folly/coro/Coroutine.h"
#include "folly/coro/Task.h"
#endif
#include "rocksdb/slice_transform.h"
#include "rocksdb/table_reader_caller.h"
#include "table/get_context.h"
#include "table/internal_iterator.h"
#include "table/multiget_context.h"
namespace ROCKSDB_NAMESPACE {
class Iterator;
struct ParsedInternalKey;
class Slice;
class Arena;
struct ReadOptions;
struct TableProperties;
class GetContext;
class MultiGetContext;
// A Table (also referred to as SST) is a sorted map from strings to strings.
// Tables are immutable and persistent. A Table may be safely accessed from
// multiple threads without external synchronization. Table readers are used
// for reading various types of table formats supported by rocksdb including
// BlockBasedTable, PlainTable and CuckooTable format.
class TableReader {
public:
virtual ~TableReader() {}
// Returns a new iterator over the table contents.
// The result of NewIterator() is initially invalid (caller must
// call one of the Seek methods on the iterator before using it).
//
// read_options: Must outlive the returned iterator.
// arena: If not null, the arena needs to be used to allocate the Iterator.
// When destroying the iterator, the caller will not call "delete"
// but Iterator::~Iterator() directly. The destructor needs to destroy
// all the states but those allocated in arena.
// skip_filters: disables checking the bloom filters even if they exist. This
// option is effective only for block-based table format.
// compaction_readahead_size: its value will only be used if caller =
// kCompaction
virtual InternalIterator* NewIterator(
const ReadOptions& read_options, const SliceTransform* prefix_extractor,
Arena* arena, bool skip_filters, TableReaderCaller caller,
size_t compaction_readahead_size = 0,
bool allow_unprepared_value = false) = 0;
// read_options.snapshot needs to outlive this call.
virtual FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
const ReadOptions& /*read_options*/) {
return nullptr;
}
virtual FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
SequenceNumber /* read_seqno */, const Slice* /* timestamp */) {
return nullptr;
}
// Given a key, return an approximate byte offset in the file where
// the data for that key begins (or would begin if the key were
// present in the file). The returned value is in terms of file
// bytes, and so includes effects like compression of the underlying data.
// E.g., the approximate offset of the last key in the table will
// be close to the file length.
// TODO(peterd): Since this function is only used for approximate size
// from beginning of file, reduce code duplication by removing this
// function and letting ApproximateSize take optional start and end, so
// that absolute start and end can be specified and optimized without
// key / index work.
virtual uint64_t ApproximateOffsetOf(const ReadOptions& read_options,
const Slice& key,
TableReaderCaller caller) = 0;
// Given start and end keys, return the approximate data size in the file
// between the keys. The returned value is in terms of file bytes, and so
// includes effects like compression of the underlying data and applicable
// portions of metadata including filters and indexes. Nullptr for start or
// end (or both) indicates absolute start or end of the table.
virtual uint64_t ApproximateSize(const ReadOptions& read_options,
const Slice& start, const Slice& end,
TableReaderCaller caller) = 0;
struct Anchor {
Anchor(const Slice& _user_key, size_t _range_size)
: user_key(_user_key.ToStringView()), range_size(_range_size) {}
std::string user_key;
size_t range_size;
};
// Now try to return approximately 128 anchor keys.
// The last one tends to be the largest key.
virtual Status ApproximateKeyAnchors(const ReadOptions& /*read_options*/,
std::vector<Anchor>& /*anchors*/) {
return Status::NotSupported("ApproximateKeyAnchors() not supported.");
}
// Set up the table for Compaction. Might change some parameters with
// posix_fadvise
virtual void SetupForCompaction() = 0;
virtual std::shared_ptr<const TableProperties> GetTableProperties() const = 0;
// Prepare work that can be done before the real Get()
virtual void Prepare(const Slice& /*target*/) {}
// Report an approximation of how much memory has been used.
virtual size_t ApproximateMemoryUsage() const = 0;
// Calls get_context->SaveValue() repeatedly, starting with
// the entry found after a call to Seek(key), until it returns false.
// May not make such a call if filter policy says that key is not present.
//
// get_context->MarkKeyMayExist needs to be called when it is configured to be
// memory only and the key is not found in the block cache.
//
// readOptions is the options for the read
// key is the key to search for
// skip_filters: disables checking the bloom filters even if they exist. This
// option is effective only for block-based table format.
virtual Status Get(const ReadOptions& readOptions, const Slice& key,
GetContext* get_context,
const SliceTransform* prefix_extractor,
bool skip_filters = false) = 0;
// Use bloom filters in the table file, if present, to filter out keys. The
// mget_range will be updated to skip keys that get a negative result from
// the filter lookup.
virtual Status MultiGetFilter(const ReadOptions& /*readOptions*/,
const SliceTransform* /*prefix_extractor*/,
MultiGetContext::Range* /*mget_range*/) {
return Status::NotSupported();
}
virtual void MultiGet(const ReadOptions& readOptions,
const MultiGetContext::Range* mget_range,
const SliceTransform* prefix_extractor,
bool skip_filters = false) {
for (auto iter = mget_range->begin(); iter != mget_range->end(); ++iter) {
*iter->s = Get(readOptions, iter->ikey, iter->get_context,
prefix_extractor, skip_filters);
}
}
#if USE_COROUTINES
virtual folly::coro::Task<void> MultiGetCoroutine(
const ReadOptions& readOptions, const MultiGetContext::Range* mget_range,
const SliceTransform* prefix_extractor, bool skip_filters = false) {
MultiGet(readOptions, mget_range, prefix_extractor, skip_filters);
co_return;
}
#endif // USE_COROUTINES
// Prefetch data corresponding to a give range of keys
// Typically this functionality is required for table implementations that
// persists the data on a non volatile storage medium like disk/SSD
virtual Status Prefetch(const ReadOptions& /* read_options */,
const Slice* begin = nullptr,
const Slice* end = nullptr) {
(void)begin;
(void)end;
// Default implementation is NOOP.
// The child class should implement functionality when applicable
return Status::OK();
}
// convert db file to a human readable form
virtual Status DumpTable(WritableFile* /*out_file*/) {
return Status::NotSupported("DumpTable() not supported");
}
// check whether there is corruption in this db file
virtual Status VerifyChecksum(const ReadOptions& /*read_options*/,
TableReaderCaller /*caller*/) {
return Status::NotSupported("VerifyChecksum() not supported");
}
// Tell the reader that the file should now be obsolete, e.g. as a hint
// to delete relevant cache entries on destruction. (It might not be safe
// to "unpin" cache entries until destruction time.) NOTE: must be thread
// safe because multiple table cache references might all mark this file as
// obsolete when they are released (the last of which destroys this reader).
virtual void MarkObsolete(uint32_t /*uncache_aggressiveness*/) {
// no-op as default
}
};
} // namespace ROCKSDB_NAMESPACE
|