1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203
|
#pragma once
#include "common.h"
#include "preset.h"
#include "server-common.h"
#include "server-http.h"
#include <mutex>
#include <condition_variable>
#include <functional>
#include <memory>
#include <set>
/**
* state diagram:
*
* UNLOADED ──► LOADING ──► LOADED
* ▲ │ │
* └───failed───┘ │
* ▲ │
* └────────unloaded─────────┘
*/
enum server_model_status {
// TODO: also add downloading state when the logic is added
SERVER_MODEL_STATUS_UNLOADED,
SERVER_MODEL_STATUS_LOADING,
SERVER_MODEL_STATUS_LOADED
};
static server_model_status server_model_status_from_string(const std::string & status_str) {
if (status_str == "unloaded") {
return SERVER_MODEL_STATUS_UNLOADED;
}
if (status_str == "loading") {
return SERVER_MODEL_STATUS_LOADING;
}
if (status_str == "loaded") {
return SERVER_MODEL_STATUS_LOADED;
}
throw std::runtime_error("invalid server model status");
}
static std::string server_model_status_to_string(server_model_status status) {
switch (status) {
case SERVER_MODEL_STATUS_UNLOADED: return "unloaded";
case SERVER_MODEL_STATUS_LOADING: return "loading";
case SERVER_MODEL_STATUS_LOADED: return "loaded";
default: return "unknown";
}
}
struct server_model_meta {
common_preset preset;
std::string name;
int port = 0;
server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
int64_t last_used = 0; // for LRU unloading
std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
bool is_active() const {
return status == SERVER_MODEL_STATUS_LOADED || status == SERVER_MODEL_STATUS_LOADING;
}
bool is_failed() const {
return status == SERVER_MODEL_STATUS_UNLOADED && exit_code != 0;
}
void update_args(common_preset_context & ctx_presets, std::string bin_path);
};
struct subprocess_s;
struct server_models {
private:
struct instance_t {
std::shared_ptr<subprocess_s> subproc; // shared between main thread and monitoring thread
std::thread th;
server_model_meta meta;
FILE * stdin_file = nullptr;
};
std::mutex mutex;
std::condition_variable cv;
std::map<std::string, instance_t> mapping;
// for stopping models
std::condition_variable cv_stop;
std::set<std::string> stopping_models;
common_preset_context ctx_preset;
common_params base_params;
std::string bin_path;
std::vector<std::string> base_env;
common_preset base_preset; // base preset from llama-server CLI args
void update_meta(const std::string & name, const server_model_meta & meta);
// unload least recently used models if the limit is reached
void unload_lru();
// not thread-safe, caller must hold mutex
void add_model(server_model_meta && meta);
public:
server_models(const common_params & params, int argc, char ** argv, char ** envp);
void load_models();
// check if a model instance exists (thread-safe)
bool has_model(const std::string & name);
// return a copy of model metadata (thread-safe)
std::optional<server_model_meta> get_meta(const std::string & name);
// return a copy of all model metadata (thread-safe)
std::vector<server_model_meta> get_all_meta();
// load and unload model instances
// these functions are thread-safe
void load(const std::string & name);
void unload(const std::string & name);
void unload_all();
// update the status of a model instance (thread-safe)
void update_status(const std::string & name, server_model_status status, int exit_code);
// wait until the model instance is fully loaded (thread-safe)
// return when the model is loaded or failed to load
void wait_until_loaded(const std::string & name);
// load the model if not loaded, otherwise do nothing (thread-safe)
// return false if model is already loaded; return true otherwise (meta may need to be refreshed)
bool ensure_model_loaded(const std::string & name);
// proxy an HTTP request to the model instance
server_http_res_ptr proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool update_last_used);
// notify the router server that a model instance is ready
// return the monitoring thread (to be joined by the caller)
static std::thread setup_child_server(const std::function<void(int)> & shutdown_handler);
};
struct server_models_routes {
common_params params;
json webui_settings = json::object();
server_models models;
server_models_routes(const common_params & params, int argc, char ** argv, char ** envp)
: params(params), models(params, argc, argv, envp) {
if (!this->params.webui_config_json.empty()) {
try {
webui_settings = json::parse(this->params.webui_config_json);
} catch (const std::exception & e) {
LOG_ERR("%s: failed to parse webui config: %s\n", __func__, e.what());
throw;
}
}
init_routes();
}
void init_routes();
// handlers using lambda function, so that they can capture `this` without `std::bind`
server_http_context::handler_t get_router_props;
server_http_context::handler_t proxy_get;
server_http_context::handler_t proxy_post;
server_http_context::handler_t get_router_models;
server_http_context::handler_t post_router_models_load;
server_http_context::handler_t post_router_models_unload;
};
/**
* A simple HTTP proxy that forwards requests to another server
* and relays the responses back.
*/
struct server_http_proxy : server_http_res {
std::function<void()> cleanup = nullptr;
public:
server_http_proxy(const std::string & method,
const std::string & host,
int port,
const std::string & path,
const std::map<std::string, std::string> & headers,
const std::string & body,
const std::function<bool()> should_stop,
int32_t timeout_read,
int32_t timeout_write
);
~server_http_proxy() {
if (cleanup) {
cleanup();
}
}
private:
std::thread thread;
struct msg_t {
std::map<std::string, std::string> headers;
int status = 0;
std::string data;
std::string content_type;
};
};
|