@@ -2,6 +2,8 @@
 
 #include "ggml.h"
 
+#include <sys/stat.h>
+
 #include <cinttypes>
 #include <fstream>
 #include <random>
@@ -127,7 +129,8 @@ static bool llama_model_load(
         int n_ctx,
         int n_parts,
         ggml_type memory_type,
-        bool vocab_only) {
+        bool vocab_only,
+        llama_progress_handler progress) {
     fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
     const int64_t t_start_us = ggml_time_us();
@@ -394,6 +397,10 @@ static bool llama_model_load(
 
     std::vector<uint8_t> tmp;
 
+    if (progress.handler) {
+        progress.handler(0, progress.ctx);
+    }
+
     for (int i = 0; i < n_parts; ++i) {
         const int part_id = i;
         //const int part_id = n_parts - i - 1;
@@ -409,6 +416,11 @@ static bool llama_model_load(
         fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
         fin.seekg(file_offset);
 
+        // stat the file for file size
+        struct stat st;
+        stat(fname_part.c_str(), &st);
+        const size_t file_size = st.st_size;
+
         // load weights
         {
             int n_tensors = 0;
@@ -579,6 +591,10 @@ static bool llama_model_load(
 
                 //fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
                 if (++n_tensors % 8 == 0) {
+                    if (progress.handler) {
+                        double current_progress = (double(i) + (double(fin.tellg()) / double(file_size))) / double(n_parts);
+                        progress.handler(current_progress, progress.ctx);
+                    }
                     fprintf(stderr, ".");
                     fflush(stderr);
                 }
@@ -596,6 +612,10 @@ static bool llama_model_load(
 
     lctx.t_load_us = ggml_time_us() - t_start_us;
 
+    if (progress.handler) {
+        progress.handler(1, progress.ctx);
+    }
+
     return true;
 }
 
@@ -1394,7 +1414,8 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
 
 struct llama_context * llama_init_from_file(
         const char * path_model,
-        struct llama_context_params params) {
+        struct llama_context_params params,
+        llama_progress_handler progress) {
     ggml_time_init();
 
     llama_context * ctx = new llama_context;
@@ -1408,7 +1429,7 @@ struct llama_context * llama_init_from_file(
 
     ggml_type type_memory = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, type_memory, params.vocab_only)) {
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, type_memory, params.vocab_only, progress)) {
         fprintf(stderr, "%s: failed to load model\n", __func__);
         delete ctx;
         return nullptr;
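
Usage note (not part of the patch): a minimal sketch of how a caller might drive the new progress parameter of llama_init_from_file. It assumes llama_progress_handler is a plain struct whose handler member is a function pointer taking the progress fraction (0.0 to 1.0, as the calls above pass 0, a computed fraction, and 1) and the opaque ctx pointer. The struct and member names come from the diff; the exact typedef, the model path, and print_progress are illustrative.

#include <cstdio>

#include "llama.h"

// Hypothetical callback: receives a fraction in [0, 1] plus the opaque ctx pointer.
static void print_progress(double progress, void * ctx) {
    (void) ctx;
    fprintf(stderr, "\rloading: %3.0f%%", progress * 100.0);
    if (progress >= 1.0) {
        fprintf(stderr, "\n");
    }
}

int main() {
    struct llama_context_params params = llama_context_default_params();

    // Assumed layout: { handler, ctx }, as implied by progress.handler(..., progress.ctx) above.
    // A null handler disables reporting, since the loader checks if (progress.handler).
    llama_progress_handler progress;
    progress.handler = print_progress;
    progress.ctx     = nullptr;

    struct llama_context * lctx = llama_init_from_file("models/7B/ggml-model-f16.bin", params, progress);
    if (lctx == nullptr) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_free(lctx);
    return 0;
}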