mirror of https://gitlab.com/niansa/libjustlm.git

Updated llama.cpp-mainline

niansa 2023-10-04 22:13:48 +02:00
parent 215db6b9b7
commit fc5e4f5aa1
4 changed files with 27 additions and 16 deletions


@@ -80,7 +80,7 @@ public:
struct Params {
int seed = 0; // RNG seed
unsigned n_threads = 0;
unsigned n_threads = 0; // Amount of threads to use, immutable after Inference was constructed
unsigned n_ctx = 2024; // Context size
unsigned n_ctx_window_top_bar = 0; // Top bar of context window. Must be smaller than context size
unsigned n_batch = 8; // Batch size


@@ -36,14 +36,21 @@ class LLaMAInference final : public Inference {
auto lparams = llama_context_default_params();
lparams.seed = params.seed;
lparams.n_ctx = params.n_ctx = params.n_ctx>0?params.n_ctx:2024;
lparams.use_mlock = params.use_mlock;
lparams.n_gpu_layers = params.n_gpu_layers;
lparams.n_threads = params.n_threads;
//lparams.n_threads_batch = params.n_threads; TODO: Is this sane?
// Create context
state->model = llama_load_model_from_file(weights_path.c_str(), lparams);
// Get model parameters
auto mparams = llama_model_default_params();
mparams.use_mlock = params.use_mlock;
mparams.n_gpu_layers = params.n_gpu_layers;
// Load model
state->model = llama_load_model_from_file(weights_path.c_str(), mparams);
if (!state->model) {
LM_THROW("Failed to initialize llama model from file", LM_BOOL_ERROR);
}
// Create context
state->ctx = llama_new_context_with_model(state->model, lparams);
if (!state->ctx) {
LM_THROW("Failed to initialize llama context from model", LM_BOOL_ERROR);
@@ -92,7 +99,8 @@ class LLaMAInference final : public Inference {
if (it + params.n_batch >= ssize_t(state->tokens.size())) break;
// Evaluate
if (llama_eval(state->ctx, state->tokens.data()+it, params.n_batch, it, params.n_threads)) {
const auto batch = llama_batch_get_one(state->tokens.data()+it, params.n_batch, it, 0);
if (llama_decode(state->ctx, batch)) {
LM_COTHROW("Failed to evaluate tokens in batches", LM_BOOL_ERROR);
}
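For reference, the old llama_eval(ctx, tokens, n_tokens, n_past, n_threads) call is replaced by wrapping the token span in a llama_batch via llama_batch_get_one and handing it to llama_decode; the thread count now comes from the context parameters rather than the call site. A rough sketch of the same batched loop, assuming a filled std::vector<llama_token> tokens and the ctx from the sketch above (batch size illustrative):

const int n_batch = 8;
int it = 0;
for (; it + n_batch < (int)tokens.size(); it += n_batch) {
    // pos_0 = it: this batch continues right after the previously decoded tokens; seq_id 0
    const auto batch = llama_batch_get_one(tokens.data() + it, n_batch, it, 0);
    if (llama_decode(ctx, batch) != 0) {
        // handle failure (non-zero return)
    }
}
// leftover tokens (fewer than n_batch) are decoded the same way, as in the next hunk
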
@@ -109,7 +117,8 @@ class LLaMAInference final : public Inference {
// Evaluate remaining tokens
if (it < state->tokens.size()) {
for (; it != state->tokens.size(); it++) {
if (llama_eval(state->ctx, state->tokens.data()+it, 1, it, params.n_threads)) {
const auto batch = llama_batch_get_one(state->tokens.data()+it, 1, it, 0);
if (llama_decode(state->ctx, batch)) {
LM_COTHROW("Failed to evaluate individual tokens", LM_BOOL_ERROR);
}
}
@@ -131,7 +140,7 @@ class LLaMAInference final : public Inference {
int llama_sample_top_p_top_k() {
auto& state = get_state();
auto logits = llama_get_logits(state->ctx);
auto n_vocab = llama_n_vocab(state->ctx);
auto n_vocab = llama_n_vocab(state->model);
// Populate initial list of all candidates
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
@@ -154,18 +163,18 @@ class LLaMAInference final : public Inference {
llama_sample_tail_free(state->ctx, &candidates_p, 1.0f, 1);
llama_sample_typical(state->ctx, &candidates_p, 1.0f, 1);
llama_sample_top_p(state->ctx, &candidates_p, params.top_p, 1);
llama_sample_temperature(state->ctx, &candidates_p, params.temp);
llama_sample_temp(state->ctx, &candidates_p, params.temp);
return accept_token(llama_sample_token(state->ctx, &candidates_p));
}
case 1: {
float mirostat_mu = 2.0f * params.mirostat_target_entropy;
const int mirostat_m = 100;
llama_sample_temperature(state->ctx, &candidates_p, params.temp);
llama_sample_temp(state->ctx, &candidates_p, params.temp);
return accept_token(llama_sample_token_mirostat(state->ctx, &candidates_p, params.mirostat_target_entropy, params.mirostat_learning_rate, mirostat_m, &mirostat_mu));
}
case 2: {
float mirostat_mu = 2.0f * params.mirostat_target_entropy;
llama_sample_temperature(state->ctx, &candidates_p, params.temp);
llama_sample_temp(state->ctx, &candidates_p, params.temp);
return accept_token(llama_sample_token_mirostat_v2(state->ctx, &candidates_p, params.mirostat_target_entropy, params.mirostat_learning_rate, &mirostat_mu));
}
default: LM_THROW("Invalid mirostat version "+std::to_string(params.prefer_mirostat), LM_BOOL_ERROR);
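Two more renames show up in the sampling path: llama_n_vocab now takes the model rather than the context, and llama_sample_temperature becomes llama_sample_temp. A condensed sketch of the plain top-k/top-p path (mirostat omitted), assuming ctx and model as above, <vector> included, and illustrative sampling values:

float *logits = llama_get_logits(ctx);
const int n_vocab = llama_n_vocab(model);            // was llama_n_vocab(ctx)

std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token id = 0; id < n_vocab; id++) {
    candidates.push_back(llama_token_data{id, logits[id], 0.0f});   // {id, logit, p}
}
llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};

llama_sample_top_k(ctx, &candidates_p, 40, 1);
llama_sample_top_p(ctx, &candidates_p, 0.95f, 1);
llama_sample_temp(ctx, &candidates_p, 0.8f);          // renamed from llama_sample_temperature
const llama_token tok = llama_sample_token(ctx, &candidates_p);
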
@@ -203,7 +212,7 @@ public:
state->tokens.resize(old_token_count+state->prompt.size());
// Run tokenizer
const auto token_count = llama_tokenize(state->ctx, prompt.c_str(), state->tokens.data()+old_token_count, state->tokens.size()-old_token_count, was_empty);
const auto token_count = llama_tokenize(state->model, prompt.c_str(), prompt.size(), state->tokens.data()+old_token_count, state->tokens.size()-old_token_count, was_empty);
state->tokens.resize(old_token_count+token_count);
// Make sure token limit isn't being hit
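llama_tokenize likewise moves from the context to the model and gains an explicit text length. A small sketch of the new call, resizing a generously sized buffer down to the actual token count as the code above does (the prompt string and add_bos flag are placeholders for state->prompt / was_empty; the same signature change applies to the single-character "\n" tokenization in the next hunk):

const std::string text = "Hello";                        // placeholder prompt
std::vector<llama_token> toks(text.size() + 8);          // upper bound on token count
int n = llama_tokenize(model, text.c_str(), (int)text.size(),
                       toks.data(), (int)toks.size(),
                       /*add_bos=*/true);
if (n >= 0) toks.resize(n);                              // negative result: buffer too small
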
@@ -240,7 +249,7 @@ public:
continue;
}
state->tokens.push_back(0);
llama_tokenize(state->ctx, "\n", &state->tokens.back(), 1, false);
llama_tokenize(state->model, "\n", 1, &state->tokens.back(), 1, false);
id = state->tokens.back();
} else {
// Add token
@@ -252,7 +261,7 @@ public:
// Get token as string
std::string str(14, ' ');
str.resize(llama_token_to_piece(state->ctx, id, str.data(), 14));
str.resize(llama_token_to_piece(state->model, id, str.data(), 14));
// Append string to function result
state->prompt.append(str);
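llama_token_to_piece is the same story: it now takes the model. A sketch of turning a sampled token id back into text, with a retry when the buffer is too small (the 14-byte buffer mirrors the code above; tok is the token from the sampling sketch):

std::string piece(14, ' ');
int n = llama_token_to_piece(model, tok, piece.data(), (int)piece.size());
if (n < 0) {                // negative return = required size, buffer was too small
    piece.resize(-n);
    n = llama_token_to_piece(model, tok, piece.data(), (int)piece.size());
}
piece.resize(n);
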
@@ -263,7 +272,8 @@ public:
else {
// Evaluate token
// TODO: Respect batch size
if (llama_eval(state->ctx, state->tokens.data()+state->tokens.size()-1, 1, state->tokens.size()-1, params.n_threads)) {
const auto batch = llama_batch_get_one(state->tokens.data()+state->tokens.size()-1, 1, state->tokens.size()-1, 0);
if (llama_decode(state->ctx, batch)) {
LM_COTHROW("Failed to evaluate new tokens", "");
}
}

@@ -1 +1 @@
Subproject commit e8422de39e4aa2f7e50574124b060a80607e654a
Subproject commit 019ba1dcd0c7775a5ac0f7442634a330eb0173cc


@@ -314,6 +314,7 @@ function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)
${GGML_OPENCL_SOURCES})
remove_nonexistent(GGML_SOURCES)
add_library(ggml${SUFFIX} OBJECT ${GGML_SOURCES})
target_compile_definitions(ggml${SUFFIX} PRIVATE _GNU_SOURCE)
if (LLAMA_K_QUANTS)
target_compile_definitions(ggml${SUFFIX} PUBLIC GGML_USE_K_QUANTS)