llama : refactor gguf_buffer (WIP)
ggerganov committed Aug 14, 2023
1 parent 797088a commit 0678318
Showing 2 changed files with 30 additions and 49 deletions.
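In short, the diff swaps the custom gguf_buffer (raw .addr/.size fields) for standard containers at its call sites in gguf-llama.cpp: std::vector<float> for the dequantization scratch buffer and std::vector<uint8_t> for the read/work and base-model buffers, so field accesses become .data()/.size() calls and the read_data/work buffers move out of the per-tensor loop to be reused. One trade-off, hinted at by the comment in gguf-util.h ("doesn't require zero-initialization"), is that std::vector<uint8_t>::resize() value-initializes new elements, which the custom buffer deliberately skipped; presumably part of why the commit is marked WIP. A minimal sketch of the call-site pattern, with hypothetical names rather than llama.cpp API:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical C-style consumer, standing in for something like ggml_init_params,
// which only wants a raw pointer plus a byte count.
static void consume(void * addr, size_t size) {
    printf("buffer of %zu bytes at %p\n", size, addr);
}

int main() {
    // Before: a hand-rolled buffer exposing public .addr and .size members.
    // After:  std::vector owns the memory; .data()/.size() provide the same view
    //         and the destructor releases it.
    std::vector<uint8_t> base_buf;
    base_buf.resize(1024); // note: value-initializes (zeroes), unlike gguf_buffer
    consume(base_buf.data(), base_buf.size());
    return 0;
}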
gguf-llama.cpp: 39 changes (19 additions, 20 deletions)
@@ -3012,11 +3012,11 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
 // quantization
 //
 
-static void llama_convert_tensor_internal(const gguf_load_tensor & tensor, gguf_buffer & output, const int nelements, const int nthread) {
-    if (output.size < nelements * sizeof(float)) {
-        output.resize(nelements * sizeof(float));
+static void llama_convert_tensor_internal(const gguf_load_tensor & tensor, std::vector<float> & output, const size_t nelements, const int nthread) {
+    if (output.size() < nelements) {
+        output.resize(nelements);
     }
-    float * f32_output = (float *) output.addr;
+    float * f32_output = (float *) output.data();
 
     ggml_type_traits_t qtype;
     if (ggml_is_quantized(tensor.type)) {
@@ -3134,10 +3134,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     };
 
     size_t idx = 0;
+
+    std::vector<uint8_t> read_data;
+    std::vector<uint8_t> work;
+
     for (gguf_load_tensor & tensor : model_loader->tensors_map.tensors) {
-        gguf_buffer read_data;
         read_data.resize(tensor.size);
-        tensor.data = read_data.addr;
+        tensor.data = read_data.data();
         model_loader->load_data_for(tensor);
 
         LLAMA_LOG_INFO("[%4zu/%4zu] %36s - %16s, type = %6s, ",
@@ -3156,7 +3159,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
-        gguf_buffer work;
 
         if (!quantize) {
             new_type = tensor.type;
@@ -3214,35 +3216,36 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
 #endif
 
+        const size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
+
         float * f32_data;
-        size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
-        gguf_buffer f32_conv_buf;
+        std::vector<float> f32_conv_buf;
 
         if (tensor.type == GGML_TYPE_F32) {
            f32_data = (float *) tensor.data;
         } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
            throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
         } else {
            llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
-           f32_data = (float *) f32_conv_buf.addr;
+           f32_data = (float *) f32_conv_buf.data();
         }
 
         LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
         fflush(stdout);
 
         work.resize(nelements * 4); // upper bound on size
-        new_data = work.addr;
+        new_data = work.data();
         std::vector<int64_t> hist_cur(1 << 4, 0);
 
-        int chunk_size = 32 * 512;
+        const int chunk_size = 32 * 512;
         const int nchunk = (nelements + chunk_size - 1)/chunk_size;
         const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
         if (nthread_use < 2) {
            new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
         } else {
            size_t counter = 0;
            new_size = 0;
-           auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+           auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements] () {
               std::vector<int64_t> local_hist;
               size_t local_size = 0;
               while (true) {
@@ -3315,8 +3318,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 }
 
-
-
 //
 // interface implementation
 //
@@ -3565,7 +3566,6 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
     LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
 
-
     // create a temporary ggml context to store the lora tensors
     // todo: calculate size from biggest possible tensor
     std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
@@ -3583,11 +3583,10 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
         model_tensors.insert(kv);
     }
 
-
     // load base model
     std::unique_ptr<llama_model_loader> model_loader;
     ggml_context * base_ctx = NULL;
-    gguf_buffer base_buf;
+    std::vector<uint8_t> base_buf;
     if (path_base_model) {
         LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
         model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
@@ -3598,8 +3597,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
         base_buf.resize(ctx_size);
 
         ggml_init_params base_params;
-        base_params.mem_size = base_buf.size;
-        base_params.mem_buffer = base_buf.addr;
+        base_params.mem_size = base_buf.size();
+        base_params.mem_buffer = base_buf.data();
         base_params.no_alloc = model_loader->use_mmap;
 
         base_ctx = ggml_init(base_params);
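A note on the largest hunk above: chunk_size is dropped from the compute lambda's capture list. This is valid because chunk_size is now a const int with a constant initializer, so reading its value inside the lambda does not odr-use it and no capture is required. The standalone sketch below reproduces the chunked, mutex-guarded work distribution that the quantization loop uses (the histogram bookkeeping is omitted); fake_quantize_chunk is a placeholder for ggml_quantize_chunk and all other names are illustrative, not llama.cpp API:

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

// Placeholder for ggml_quantize_chunk: "quantizes" [first, first + n) and returns a byte count.
static size_t fake_quantize_chunk(const float * src, size_t first, size_t n) {
    size_t bytes = 0;
    for (size_t i = first; i < first + n; ++i) {
        bytes += src[i] > 0.5f ? 1 : 2;
    }
    return bytes;
}

int main() {
    const size_t nelements = 1000000;
    std::vector<float> f32_data(nelements, 1.0f);

    const int chunk_size = 32 * 512; // const with constant initializer:
                                     // readable inside the lambda without being captured
    std::mutex mutex;
    size_t counter  = 0;             // next element index to hand out
    size_t new_size = 0;             // total "quantized" size across all threads

    auto compute = [&mutex, &counter, &new_size, &f32_data, nelements]() {
        size_t local_size = 0;
        while (true) {
            size_t first;
            {
                std::lock_guard<std::mutex> lock(mutex);
                first = counter;
                if (first >= nelements) {
                    new_size += local_size; // fold the thread-local result into the total
                    return;
                }
                counter += chunk_size;
            }
            const size_t last = std::min(nelements, first + chunk_size);
            local_size += fake_quantize_chunk(f32_data.data(), first, last - first);
        }
    };

    const int nthread = 4;
    std::vector<std::thread> workers;
    for (int i = 1; i < nthread; ++i) {
        workers.emplace_back(compute);
    }
    compute(); // the calling thread does a share of the work too
    for (auto & w : workers) {
        w.join();
    }

    printf("new_size = %zu\n", new_size);
    return 0;
}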
gguf-util.h: 40 changes (11 additions, 29 deletions)
@@ -476,66 +476,48 @@ struct gguf_mlock {
 
 // Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
 struct gguf_buffer {
-    uint8_t * addr = NULL;
+    void * addr = NULL;
     size_t size = 0;
 
-    gguf_buffer() = default;
 
     void resize(size_t len) {
-#ifdef GGML_USE_METAL
-        free(addr);
-        int result = posix_memalign((void **) &addr, getpagesize(), len);
-        if (result == 0) {
-            memset(addr, 0, len);
-        }
-        else {
-            addr = NULL;
-        }
+#ifdef GGML_USE_METAL
+        const int result = posix_memalign((void **) &addr, getpagesize(), len);
+        GGML_ASSERT(result == 0);
 #else
-        delete[] addr;
-        addr = new uint8_t[len];
+        addr = malloc(len);
 #endif
+        GGML_ASSERT(addr);
         size = len;
     }
 
     ~gguf_buffer() {
-#ifdef GGML_USE_METAL
         free(addr);
-#else
-        delete[] addr;
-#endif
-        addr = NULL;
     }
 
-    // disable copy and move
-    gguf_buffer(const gguf_buffer&) = delete;
-    gguf_buffer(gguf_buffer&&) = delete;
-    gguf_buffer& operator=(const gguf_buffer&) = delete;
-    gguf_buffer& operator=(gguf_buffer&&) = delete;
 };
 
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 struct gguf_ctx_buffer {
     uint8_t * addr = NULL;
-    bool is_cuda;
     size_t size = 0;
 
-    gguf_ctx_buffer() = default;
+    bool is_cuda = false;
 
-    void resize(size_t size) {
+    void resize(size_t len) {
         free();
 
-        addr = (uint8_t *) ggml_cuda_host_malloc(size);
+        addr = (uint8_t *) ggml_cuda_host_malloc(len);
         if (addr) {
             is_cuda = true;
         }
         else {
             // fall back to pageable memory
-            addr = new uint8_t[size];
+            addr = new uint8_t[len];
             is_cuda = false;
         }
-        this->size = size;
+        size = len;
     }
 
     void free() {
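Since the unified hunk above is dense, here is gguf_buffer as it reads after this change, reassembled from the kept and added lines (a reconstruction, so blank lines and indentation are approximate; GGML_ASSERT, posix_memalign and getpagesize come from the surrounding llama.cpp/ggml headers):

// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
struct gguf_buffer {
    void * addr = NULL;
    size_t size = 0;

    void resize(size_t len) {
#ifdef GGML_USE_METAL
        const int result = posix_memalign((void **) &addr, getpagesize(), len);
        GGML_ASSERT(result == 0);
#else
        addr = malloc(len);
#endif
        GGML_ASSERT(addr);
        size = len;
    }

    ~gguf_buffer() {
        free(addr);
    }
};

As committed, resize() no longer releases a previous allocation before allocating again (the old Metal path called free(addr) first, the generic path used delete[]), and the explicitly deleted copy/move operations are gone; the WIP label in the commit title presumably covers these loose ends.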
