1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438
|
# frozen-string-literal: true
#
# The async_thread_pool extension adds support for running database
# queries in a separate threads using a thread pool. With the following
# code
#
# DB.extension :async_thread_pool
# foos = DB[:foos].async.where(name: 'A'..'M').all
# bar_names = DB[:bar].async.select_order_map(:name)
# baz_1 = DB[:bazes].async.first(id: 1)
#
# All 3 queries will be run in separate threads. +foos+, +bar_names+
# and +baz_1+ will be proxy objects. Calling a method on the proxy
# object will wait for the query to be run, and will return the result
# of calling that method on the result of the query method. For example,
# if you run:
#
# foos = DB[:foos].async.where(name: 'A'..'M').all
# bar_names = DB[:bars].async.select_order_map(:name)
# baz_1 = DB[:bazes].async.first(id: 1)
# sleep(1)
# foos.size
# bar_names.first
# baz_1.name
#
# These three queries will generally be run concurrently in separate
# threads. If you instead run:
#
# DB[:foos].async.where(name: 'A'..'M').all.size
# DB[:bars].async.select_order_map(:name).first
# DB[:bazes].async.first(id: 1).name
#
# Then will run each query sequentially, since you need the result of
# one query before running the next query. The queries will still be
# run in separate threads (by default).
#
# What is run in the separate thread is the entire method call that
# returns results. So with the original example:
#
# foos = DB[:foos].async.where(name: 'A'..'M').all
# bar_names = DB[:bars].async.select_order_map(:name)
# baz_1 = DB[:bazes].async.first(id: 1)
#
# The +all+, <tt>select_order_map(:name)</tt>, and <tt>first(id: 1)</tt>
# calls are run in separate threads. If a block is passed to a method
# such as +all+ or +each+, the block is also run in that thread. If you
# have code such as:
#
# h = {}
# DB[:foos].async.each{|row| h[row[:id]] = row}
# bar_names = DB[:bars].async.select_order_map(:name)
# p h
#
# You may end up with it printing an empty hash or partial hash, because the
# async +each+ call will not have run or finished running. Since the
# <tt>p h</tt> code relies on a side-effect of the +each+ block and not the
# return value of the +each+ call, it will not wait for the loading.
#
# You should avoid using +async+ for any queries where you are ignoring the
# return value, as otherwise you have no way to wait for the query to be run.
#
# Datasets that use async will use async threads to load data for the majority
# of methods that can return data. However, dataset methods that return
# enumerators will not use an async thread (e.g. calling # Dataset#map
# without a block or arguments does not use an async thread or return a
# proxy object).
#
# Because async methods (including their blocks) run in a separate thread, you
# should not use control flow modifiers such as +return+ or +break+ in async
# queries. Doing so will result in a error.
#
# Because async results are returned as proxy objects, it's a bad idea
# to use them in a boolean setting:
#
# result = DB[:foo].async.get(:boolean_column)
# # or:
# result = DB[:foo].async.first
#
# # ...
# if result
# # will always execute this banch, since result is a proxy object
# end
#
# In this case, you can call the +__value+ method to return the actual
# result:
#
# if result.__value
# # will not execute this branch if the dataset method returned nil or false
# end
#
# Similarly, because a proxy object is used, you should be careful using the
# result in a case statement or an argument to <tt>Class#===</tt>:
#
# # ...
# case result
# when Hash, true, false
# # will never take this branch, since result is a proxy object
# end
#
# Similar to usage in an +if+ statement, you should use +__value+:
#
# case result.__value
# when Hash, true, false
# # will never take this branch, since result is a proxy object
# end
#
# On Ruby 2.2+, you can use +itself+ instead of +__value+. It's preferable to
# use +itself+ if you can, as that will allow code to work with both proxy
# objects and regular objects.
#
# Because separate threads and connections are used for async queries,
# they do not use any state on the current connection/thread. So if
# you do:
#
# DB.transaction{DB[:table].async.all}
#
# Be aware that the transaction runs on one connection, and the SELECT
# query on a different connection. If you use currently using
# transactional testing (running each test inside a transaction/savepoint),
# and want to start using this extension, you should first switch to
# non-transactional testing of the code that will use the async thread
# pool before using this extension, as otherwise the use of
# <tt>Dataset#async</tt> will likely break your tests.
#
# If you are using Database#synchronize to checkout a connection, the
# same issue applies, where the async query runs on a different
# connection:
#
# DB.synchronize{DB[:table].async.all}
#
# Similarly, if you are using the server_block extension, any async
# queries inside with_server blocks will not use the server specified:
#
# DB.with_server(:shard1) do
# DB[:a].all # Uses shard1
# DB[:a].async.all # Uses default shard
# end
#
# You need to manually specify the shard for any dataset using an async
# query:
#
# DB.with_server(:shard1) do
# DB[:a].all # Uses shard1
# DB[:a].async.server(:shard1).all # Uses shard1
# end
#
# When the async_thread_pool extension, the size of the async thread pool
# can be set by using the +:num_async_threads+ Database option, which must
# be set before loading the async_thread_pool extension. This defaults
# to the size of the Database object's connection pool.
#
# By default, for consistent behavior, the async_thread_pool extension
# will always run the query in a separate thread. However, in some cases,
# such as when the async thread pool is busy and the results of a query
# are needed right away, it can improve performance to allow preemption,
# so that the query will run in the current thread instead of waiting
# for an async thread to become available. With the following code:
#
# foos = DB[:foos].async.where(name: 'A'..'M').all
# bar_names = DB[:bar].async.select_order_map(:name)
# if foos.length > 4
# baz_1 = DB[:bazes].async.first(id: 1)
# end
#
# Whether you need the +baz_1+ variable depends on the value of foos.
# If the async thread pool is busy, and by the time the +foos.length+
# call is made, the async thread pool has not started the processing
# to get the +foos+ value, it can improve performance to start that
# processing in the current thread, since it is needed immediately to
# determine whether to schedule query to get the +baz_1+ variable.
# The default is to not allow preemption, because if the current
# thread is used, it may have already checked out a connection that
# could be used, and that connection could be inside a transaction or
# have some other manner of connection-specific state applied to it.
# If you want to allow preemption, you can set the
# +:preempt_async_thread+ Database option before loading the
# async_thread_pool extension.
#
# Related module: Sequel::Database::AsyncThreadPool::DatasetMethods
#
module Sequel
module Database::AsyncThreadPool
# JobProcessor is a wrapper around a single thread, that will
# process a queue of jobs until it is shut down.
class JobProcessor # :nodoc:
def self.create_finalizer(queue, pool)
proc{run_finalizer(queue, pool)}
end
def self.run_finalizer(queue, pool)
# Push a nil for each thread using the queue, signalling
# that thread to close.
pool.each{queue.push(nil)}
# Join each of the closed threads.
pool.each(&:join)
# Clear the thread pool. Probably not necessary, but this allows
# for a simple way to check whether this finalizer has been run.
pool.clear
nil
end
private_class_method :run_finalizer
def initialize(queue)
@thread = ::Thread.new do
while proxy = queue.pop
proxy.__send__(:__run)
end
end
end
# Join the thread, should only be called by the related finalizer.
def join
@thread.join
end
end
# Wrapper for exception instances raised by async jobs. The
# wrapped exception will be raised by the code getting the value
# of the job.
WrappedException = Struct.new(:exception)
# Base proxy object class for jobs processed by async threads and
# the returned result.
class BaseProxy < BasicObject
# Store a block that returns the result when called.
def initialize(&block)
::Kernel.raise Error, "must provide block for an async job" unless block
@block = block
end
# Pass all method calls to the returned result.
def method_missing(*args, &block)
__value.public_send(*args, &block)
end
# :nocov:
ruby2_keywords(:method_missing) if respond_to?(:ruby2_keywords, true)
# :nocov:
# Delegate respond_to? calls to the returned result.
def respond_to_missing?(*args)
__value.respond_to?(*args)
end
# Override some methods defined by default so they apply to the
# returned result and not the current object.
[:!, :==, :!=, :instance_eval, :instance_exec].each do |method|
define_method(method) do |*args, &block|
__value.public_send(method, *args, &block)
end
end
# Wait for the value to be loaded if it hasn't already been loaded.
# If the code to load the return value raised an exception that was
# wrapped, reraise the exception.
def __value
unless defined?(@value)
__get_value
end
if @value.is_a?(WrappedException)
::Kernel.raise @value
end
@value
end
private
# Run the block and return the block value. If the block call raises
# an exception, wrap the exception.
def __run_block
# This may not catch concurrent calls (unless surrounded by a mutex), but
# it's not worth trying to protect against that. It's enough to just check for
# multiple non-concurrent calls.
::Kernel.raise Error, "Cannot run async block multiple times" unless block = @block
@block = nil
begin
block.call
rescue ::Exception => e
WrappedException.new(e)
end
end
end
# Default object class for async job/proxy result. This uses a queue for
# synchronization. The JobProcessor will push a result until the queue,
# and the code to get the value will pop the result from that queue (and
# repush the result to handle thread safety).
class Proxy < BaseProxy
def initialize
super
@queue = ::Queue.new
end
private
def __run
@queue.push(__run_block)
end
def __get_value
@value = @queue.pop
# Handle thread-safety by repushing the popped value, so that
# concurrent calls will receive the same value
@queue.push(@value)
end
end
# Object class for async job/proxy result when the :preempt_async_thread
# Database option is used. Uses a mutex for synchronization, and either
# the JobProcessor or the calling thread can run code to get the value.
class PreemptableProxy < BaseProxy
def initialize
super
@mutex = ::Mutex.new
end
private
def __get_value
@mutex.synchronize do
unless defined?(@value)
@value = __run_block
end
end
end
alias __run __get_value
end
module DatabaseMethods
def self.extended(db)
db.instance_exec do
unless pool.pool_type == :threaded || pool.pool_type == :sharded_threaded
raise Error, "can only load async_thread_pool extension if using threaded or sharded_threaded connection pool"
end
num_async_threads = opts[:num_async_threads] ? typecast_value_integer(opts[:num_async_threads]) : (Integer(opts[:max_connections] || 4))
raise Error, "must have positive number for num_async_threads" if num_async_threads <= 0
proxy_klass = typecast_value_boolean(opts[:preempt_async_thread]) ? PreemptableProxy : Proxy
define_singleton_method(:async_job_class){proxy_klass}
queue = @async_thread_queue = Queue.new
pool = @async_thread_pool = num_async_threads.times.map{JobProcessor.new(queue)}
ObjectSpace.define_finalizer(db, JobProcessor.create_finalizer(queue, pool))
extend_datasets(DatasetMethods)
end
end
private
# Wrap the block in a job/proxy object and schedule it to run using the async thread pool.
def async_run(&block)
proxy = async_job_class.new(&block)
@async_thread_queue.push(proxy)
proxy
end
end
ASYNC_METHODS = ([:all?, :any?, :drop, :entries, :grep_v, :include?, :inject, :member?, :minmax, :none?, :one?, :reduce, :sort, :take, :tally, :to_a, :to_h, :uniq, :zip] & Enumerable.instance_methods) + (Dataset::ACTION_METHODS - [:map, :paged_each])
ASYNC_BLOCK_METHODS = ([:collect, :collect_concat, :detect, :drop_while, :each_cons, :each_entry, :each_slice, :each_with_index, :each_with_object, :filter_map, :find, :find_all, :find_index, :flat_map, :max_by, :min_by, :minmax_by, :partition, :reject, :reverse_each, :sort_by, :take_while] & Enumerable.instance_methods) + [:paged_each]
ASYNC_ARGS_OR_BLOCK_METHODS = [:map]
module DatasetMethods
# Define an method in the given module that will run the given method using an async thread
# if the current dataset is async.
def self.define_async_method(mod, method)
mod.send(:define_method, method) do |*args, &block|
if @opts[:async]
ds = sync
db.send(:async_run){ds.send(method, *args, &block)}
else
super(*args, &block)
end
end
end
# Define an method in the given module that will run the given method using an async thread
# if the current dataset is async and a block is provided.
def self.define_async_block_method(mod, method)
mod.send(:define_method, method) do |*args, &block|
if block && @opts[:async]
ds = sync
db.send(:async_run){ds.send(method, *args, &block)}
else
super(*args, &block)
end
end
end
# Define an method in the given module that will run the given method using an async thread
# if the current dataset is async and arguments or a block is provided.
def self.define_async_args_or_block_method(mod, method)
mod.send(:define_method, method) do |*args, &block|
if (block || !args.empty?) && @opts[:async]
ds = sync
db.send(:async_run){ds.send(method, *args, &block)}
else
super(*args, &block)
end
end
end
# Override all of the methods that return results to do the processing in an async thread
# if they have been marked to run async and should run async (i.e. they don't return an
# Enumerator).
ASYNC_METHODS.each{|m| define_async_method(self, m)}
ASYNC_BLOCK_METHODS.each{|m| define_async_block_method(self, m)}
ASYNC_ARGS_OR_BLOCK_METHODS.each{|m| define_async_args_or_block_method(self, m)}
# Return a cloned dataset that will load results using the async thread pool.
def async
cached_dataset(:_async) do
clone(:async=>true)
end
end
# Return a cloned dataset that will not load results using the async thread pool.
# Only used if the current dataset has been marked as using the async thread pool.
def sync
cached_dataset(:_sync) do
clone(:async=>false)
end
end
end
end
Database.register_extension(:async_thread_pool, Database::AsyncThreadPool::DatabaseMethods)
end
|