Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/concedo'
Browse files Browse the repository at this point in the history
  • Loading branch information
YellowRoseCx committed Aug 10, 2023
1 parent b7cb4cf commit bacc202
Show file tree
Hide file tree
Showing 6 changed files with 79 additions and 52 deletions.
13 changes: 9 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
# IT WILL NOT BE UPDATED OR MAINTAINED !!!

message(STATUS "============== ============== ==============")
message(STATUS "WARNING! Do NOT use this file. It is UNSUPPORTED for normal users. Use MAKE instead.")
message(STATUS "It is ONLY for CUBLAS build testing on windows visual studio. IT WILL NOT BE UPDATED OR MAINTAINED !!!")
message(STATUS "IF YOU ARE SEEING THIS, you MUST ONLY be building AN EXPERIMENAL WINDOWS CUBLAS BUILD! NOTHING ELSE WILL BE SUPPORTED !!!")
message(STATUS "WARNING! Recommend NOT to use this file. It is UNSUPPORTED for normal users. Use MAKE instead.")
message(STATUS "It is ONLY for CUBLAS builds on windows visual studio. IT WILL OVERWRITE YOUR EXISTING MAKEFILE !!!")
message(STATUS "IF YOU ARE SEEING THIS, you MUST ONLY be building CUBLAS BUILDS! NOTHING ELSE WILL BE SUPPORTED !!!")
message(STATUS "============== ============== ==============")

cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason
Expand Down Expand Up @@ -110,7 +110,12 @@ if (LLAMA_CUBLAS)
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
else()
set(CMAKE_CUDA_ARCHITECTURES "37;52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
message("CUDA Toolkit Version: ${CUDAToolkit_VERSION}")
# Pick the default architecture list from the detected toolkit version.
# Architecture 37 is only requested for toolkits older than CUDA 12; from
# 12.0 onward the list starts at 52. VERSION_GREATER_EQUAL is used so that a
# toolkit reported as exactly "12" or "12.0" also takes the CUDA 12 branch
# (VERSION_GREATER 12 would be false for it, since missing version
# components compare as zero).
if("${CUDAToolkit_VERSION}" VERSION_GREATER_EQUAL "12")
    set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
else()
    set(CMAKE_CUDA_ARCHITECTURES "37;52;61;70") # pre-CUDA 12 toolkits: additionally target 37
endif()
endif()
endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
Expand Down
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,11 @@ ifdef LLAMA_CUDA_KQUANTS_ITER
else
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
endif
# Always pass a GGML_CUDA_MMQ_Y define to nvcc: use the value of
# LLAMA_CUDA_MMQ_Y when it is set to a non-empty value, otherwise fall back
# to 64.
ifdef LLAMA_CUDA_MMQ_Y
NVCCFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y)
else
NVCCFLAGS += -DGGML_CUDA_MMQ_Y=64
endif # LLAMA_CUDA_MMQ_Y
#ifdef LLAMA_CUDA_CUBLAS
# NVCCFLAGS += -DGGML_CUDA_CUBLAS
#endif # LLAMA_CUDA_CUBLAS
Expand Down
15 changes: 9 additions & 6 deletions klite.embd

Large diffs are not rendered by default.

35 changes: 18 additions & 17 deletions koboldcpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ def utfprint(str):
maxhordelen = 256
modelbusy = threading.Lock()
defaultport = 5001
KcppVersion = "1.39.1"
KcppVersion = "1.40.1"
showdebug = True
showsamplerwarning = True
showmaxctxwarning = True
Expand Down Expand Up @@ -496,7 +496,7 @@ def do_GET(self):
laste = handle.get_last_eval_time()
lastc = handle.get_last_token_count()
stopreason = handle.get_last_stop_reason()
response_body = (json.dumps({"last_process":lastp,"last_eval":laste,"last_token_count":lastc, "stop_reason":stopreason}).encode())
response_body = (json.dumps({"last_process":lastp,"last_eval":laste,"last_token_count":lastc, "stop_reason":stopreason, "idle":(0 if modelbusy.locked() else 1)}).encode())

if response_body is None:
self.send_response(404)
Expand Down Expand Up @@ -674,7 +674,7 @@ def show_new_gui():
root.destroy()
if not args.model_param:
print("\nNo ggml model file was selected. Exiting.")
time.sleep(2)
time.sleep(3)
sys.exit(2)
return

Expand Down Expand Up @@ -1306,7 +1306,7 @@ def display_help():

if nextstate==0:
print("Exiting by user request.")
time.sleep(2)
time.sleep(3)
sys.exit()
elif nextstate==2:
time.sleep(0.1)
Expand All @@ -1317,7 +1317,7 @@ def display_help():

if not args.model_param:
print("\nNo ggml model file was selected. Exiting.")
time.sleep(2)
time.sleep(3)
sys.exit(2)

def show_gui_warning(issue=None):
Expand All @@ -1329,7 +1329,7 @@ def show_gui_warning(issue=None):
messagebox.showerror(title="No Backends Available!", message="KoboldCPP couldn't locate any backends to use.\n\nTo use the program, please run the 'make' command from the directory.")
root.destroy()
print("No Backend Available (i.e Default, OpenBLAS, CLBlast, CuBLAS). To use the program, please run the 'make' command from the directory.")
time.sleep(2)
time.sleep(3)
sys.exit(2)
else:
messagebox.showerror(title="New GUI failed, using Old GUI", message="The new GUI failed to load.\n\nTo use new GUI, please install the customtkinter python module.")
Expand Down Expand Up @@ -1423,7 +1423,7 @@ def onDropdownChange(event):

if launchclicked==False:
print("Exiting by user request.")
time.sleep(2)
time.sleep(3)
sys.exit()

#load all the vars
Expand Down Expand Up @@ -1479,7 +1479,7 @@ def onDropdownChange(event):
root.destroy()
if not args.model_param:
print("\nNo ggml model file was selected. Exiting.")
time.sleep(2)
time.sleep(3)
sys.exit(2)

else:
Expand All @@ -1489,7 +1489,7 @@ def onDropdownChange(event):
root.destroy()
if not args.model_param:
print("\nNo ggml model file was selected. Exiting.")
time.sleep(2)
time.sleep(3)
sys.exit(2)

#A very simple and stripped down embedded horde worker with no dependencies
Expand Down Expand Up @@ -1534,7 +1534,7 @@ def make_url_request(url, data, method='POST'):
BRIDGE_AGENT = f"KoboldCppEmbedWorker:1:https://github.com/LostRuins/koboldcpp"
cluster = "https://horde.koboldai.net"
while exitcounter < 10:
time.sleep(2)
time.sleep(3)
readygo = make_url_request(f'{epurl}/api/v1/info/version', None,'GET')
if readygo:
print("Embedded Horde Worker is started.")
Expand Down Expand Up @@ -1610,10 +1610,10 @@ def make_url_request(url, data, method='POST'):
time.sleep(1)
if exitcounter<100:
print("Horde Worker Shutdown - Too many errors.")
time.sleep(2)
time.sleep(3)
else:
print("Horde Worker Shutdown - Server Closing.")
time.sleep(1)
time.sleep(2)
sys.exit(2)

def main(args):
Expand All @@ -1637,7 +1637,7 @@ def main(args):
except Exception as ex2:
print("File selection GUI unsupported. Please check command line: script.py --help")
print("Reason for no GUI: " + str(ex2))
time.sleep(2)
time.sleep(3)
sys.exit(2)

if args.hordeconfig and args.hordeconfig[0]!="":
Expand Down Expand Up @@ -1681,20 +1681,20 @@ def main(args):
time.sleep(1)
if not os.path.exists(args.model_param):
print(f"Cannot find model file: {args.model_param}")
time.sleep(2)
time.sleep(3)
sys.exit(2)

if args.lora and args.lora[0]!="":
if not os.path.exists(args.lora[0]):
print(f"Cannot find lora file: {args.lora[0]}")
time.sleep(2)
time.sleep(3)
sys.exit(2)
else:
args.lora[0] = os.path.abspath(args.lora[0])
if len(args.lora) > 1:
if not os.path.exists(args.lora[1]):
print(f"Cannot find lora base: {args.lora[1]}")
time.sleep(2)
time.sleep(3)
sys.exit(2)
else:
args.lora[1] = os.path.abspath(args.lora[1])
Expand All @@ -1715,7 +1715,7 @@ def main(args):

if not loadok:
print("Could not load model: " + modelname)
time.sleep(2)
time.sleep(3)
sys.exit(3)
try:
basepath = os.path.abspath(os.path.dirname(__file__))
Expand Down Expand Up @@ -1743,6 +1743,7 @@ def main(args):

if args.hordeconfig and len(args.hordeconfig)>4:
horde_thread = threading.Thread(target=run_horde_worker,args=(args,args.hordeconfig[3],args.hordeconfig[4]))
horde_thread.daemon = True
horde_thread.start()

print(f"Please connect to custom endpoint at {epurl}")
Expand Down
19 changes: 12 additions & 7 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,8 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
{ MODEL_7B, 512ull * kB },
{ MODEL_13B, 640ull * kB },
{ MODEL_30B, 768ull * kB },
{ MODEL_65B, 1280ull * kB },
{ MODEL_70B, 1280ull * kB },
{ MODEL_65B, 1360ull * kB },
{ MODEL_70B, 1360ull * kB },
};
return k_sizes;
}
Expand All @@ -173,8 +173,8 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
{ MODEL_7B, 128ull },
{ MODEL_13B, 160ull },
{ MODEL_30B, 208ull },
{ MODEL_65B, 256ull },
{ MODEL_70B, 256ull },
{ MODEL_65B, 320ull },
{ MODEL_70B, 320ull },
};
return k_sizes;
}
Expand Down Expand Up @@ -937,6 +937,11 @@ bool llama_mlock_supported() {
return llama_mlock::SUPPORTED;
}

// Maps a batch size to the multiplier callers store as `blasbatchmul`:
//   batch <= 512          -> 1
//   512 < batch <= 1024   -> 2
//   batch > 1024          -> 4
int get_blas_batch_mul(int batch)
{
    if (batch > 1024) {
        return 4;
    }
    if (batch > 512) {
        return 2;
    }
    return 1;
}

void llama_backend_init(bool numa) {
ggml_time_init();

Expand Down Expand Up @@ -1042,7 +1047,7 @@ static void llama_model_load_internal(
void * progress_callback_user_data) {

model.t_start_us = ggml_time_us();
size_t blasbatchmul = (n_batch>512?(n_batch>1024?4:2):1);
size_t blasbatchmul = get_blas_batch_mul(n_batch);

std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));

Expand Down Expand Up @@ -1076,7 +1081,7 @@ static void llama_model_load_internal(
// LLaMAv2
// TODO: temporary until GGUF
//patch for llama2 gqa
if (model.type == e_model::MODEL_65B && hparams.n_mult == 4096) {
if (model.type == e_model::MODEL_65B && (hparams.n_mult >= 4096 && hparams.n_mult != 5504)) {
fprintf(stderr, "%s: Applying KCPP Patch for 70B model, setting GQA to 8\n", __func__);
n_gqa = 8;
}
Expand Down Expand Up @@ -3248,7 +3253,7 @@ struct llama_context * llama_new_context_with_model(
params.seed = time(NULL);
}

size_t blasbatchmul = (params.n_batch>512?2:1);
size_t blasbatchmul = get_blas_batch_mul(params.n_batch);

unsigned cur_percentage = 0;
if (params.progress_callback == NULL) {
Expand Down
44 changes: 26 additions & 18 deletions model_adapter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,28 +133,36 @@ void print_tok_vec(std::vector<float> &embd)
else if(vocabsiz==50257 || (vocabsiz>=49152&&vocabsiz<=49157)) //49152-6 is starcoder
{
fileformat = FileFormat::GPT2_1;
uint32_t temp;
fin.read((char *)&temp, sizeof(temp)); //ctx
fin.read((char *)&temp, sizeof(temp)); //n_embd
fin.read((char *)&temp, sizeof(temp)); //n_head
uint32_t temp, v1,v2,v3;
fin.read((char *)&v1, sizeof(temp)); //ctx
fin.read((char *)&v2, sizeof(temp)); //n_embd
fin.read((char *)&v3, sizeof(temp)); //n_head
fin.read((char *)&temp, sizeof(temp)); //n_layer
fin.read((char *)&temp, sizeof(temp)); //f16
const int32_t qntvr = temp / 1000;
temp %= 1000;
if (qntvr != 0)
if(vocabsiz==49152 && v1==4096 && v2==2560 && v3==32 && temp==32)
{
if (qntvr == 1)
{
fileformat = FileFormat::GPT2_3;
}
else
{
fileformat = FileFormat::GPT2_4;
}
//special case, Stablecode Completion Alpha 3B
fileformat = FileFormat::NEOX_6;
}
else if (temp != 0 && temp != 1)
else
{
fileformat = FileFormat::GPT2_2; //quantized format cannot be legacy type
fin.read((char *)&temp, sizeof(temp)); //f16
const int32_t qntvr = temp / 1000;
temp %= 1000;
if (qntvr != 0)
{
if (qntvr == 1)
{
fileformat = FileFormat::GPT2_3;
}
else
{
fileformat = FileFormat::GPT2_4;
}
}
else if (temp != 0 && temp != 1)
{
fileformat = FileFormat::GPT2_2; //quantized format cannot be legacy type
}
}
}
else if(vocabsiz < 31998 || vocabsiz > 33000)
Expand Down

0 comments on commit bacc202

Please sign in to comment.