mirror of https://github.com/novarobot/llama.cpp
Compare commits
58 Commits
master-8cf
...
master
| Author | SHA1 | Date |
|---|---|---|
|
|
b6b268d441 | 3 years ago |
|
|
3cd8dde0d1 | 3 years ago |
|
|
4870e455b3 | 3 years ago |
|
|
483bab2e3d | 3 years ago |
|
|
404e1da38e | 3 years ago |
|
|
4cc053b6d5 | 3 years ago |
|
|
0ba5a3a9a5 | 3 years ago |
|
|
2e17dfd80a | 3 years ago |
|
|
20a1a4e09c | 3 years ago |
|
|
ad072fc5ad | 3 years ago |
|
|
ea10d3ded2 | 3 years ago |
|
|
a18c19259a | 3 years ago |
|
|
a50e39c6fe | 3 years ago |
|
|
a140219e81 | 3 years ago |
|
|
8a3e5ef801 | 3 years ago |
|
|
8eea5ae0e5 | 3 years ago |
|
|
93208cfb92 | 3 years ago |
|
|
03ace14cfd | 3 years ago |
|
|
e4412b45e3 | 3 years ago |
|
|
f7dc43bc0d | 3 years ago |
|
|
ee8a788786 | 3 years ago |
|
|
69c92298a9 | 3 years ago |
|
|
97940520e8 | 3 years ago |
|
|
305ba6f0e6 | 3 years ago |
|
|
4122dffff9 | 3 years ago |
|
|
56e659a0b2 | 3 years ago |
|
|
40ea807a97 | 3 years ago |
|
|
d5850c53ca | 3 years ago |
|
|
ae44e23ee3 | 3 years ago |
|
|
928480ef5b | 3 years ago |
|
|
56817b1f88 | 3 years ago |
|
|
f5a77a629b | 3 years ago |
|
|
da0e9fe90c | 3 years ago |
|
|
e6c9e0986c | 3 years ago |
|
|
01a297b099 | 3 years ago |
|
|
3366853e41 | 3 years ago |
|
|
3f9c6135e4 | 3 years ago |
|
|
0f61352708 | 3 years ago |
|
|
353ec251a4 | 3 years ago |
|
|
89d5d90f3b | 3 years ago |
|
|
16ffc013c6 | 3 years ago |
|
|
486ae645fd | 3 years ago |
|
|
3ab3e6582f | 3 years ago |
|
|
f157088cb7 | 3 years ago |
|
|
c86ba036e6 | 3 years ago |
|
|
1daf4dd712 | 3 years ago |
|
|
dc6a845b85 | 3 years ago |
|
|
6a612959e1 | 3 years ago |
|
|
d5f56a5e5a | 3 years ago |
|
|
3bfa3b43b7 | 3 years ago |
|
|
715d292ee0 | 3 years ago |
|
|
c98ae02668 | 3 years ago |
|
|
c3b2306b18 | 3 years ago |
|
|
975d2cebf9 | 3 years ago |
|
|
e0ffc861fa | 3 years ago |
|
|
8f644a0a85 | 3 years ago |
|
|
eb34620aec | 3 years ago |
|
|
2e664f1ff4 | 3 years ago |
@ -0,0 +1,185 @@
|
||||
---
|
||||
name: Issue and enhancement template
|
||||
about: Used to report issues and request enhancements for llama.cpp
|
||||
title: "[User] Insert summary of your issue or enhancement.."
|
||||
labels: ''
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
# Prerequisites
|
||||
|
||||
Please answer the following questions for yourself before submitting an issue.
|
||||
|
||||
- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
|
||||
- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
|
||||
- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
|
||||
- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
|
||||
|
||||
# Expected Behavior
|
||||
|
||||
Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do.
|
||||
|
||||
# Current Behavior
|
||||
|
||||
Please provide a detailed written description of what `llama.cpp` did, instead.
|
||||
|
||||
# Environment and Context
|
||||
|
||||
Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions.
|
||||
|
||||
* Physical (or virtual) hardware you are using, e.g. for Linux:
|
||||
|
||||
`$ lscpu`
|
||||
|
||||
* Operating System, e.g. for Linux:
|
||||
|
||||
`$ uname -a`
|
||||
|
||||
* SDK version, e.g. for Linux:
|
||||
|
||||
```
|
||||
$ python3 --version
|
||||
$ make --version
|
||||
$ g++ --version
|
||||
```
|
||||
|
||||
# Failure Information (for bugs)
|
||||
|
||||
Please help provide information about the failure if this is a bug. If it is not a bug, please remove the rest of this template.
|
||||
|
||||
# Steps to Reproduce
|
||||
|
||||
Please provide detailed steps for reproducing the issue. We are not sitting in front of your screen, so the more detail the better.
|
||||
|
||||
1. step 1
|
||||
2. step 2
|
||||
3. step 3
|
||||
4. etc.
|
||||
|
||||
# Failure Logs
|
||||
|
||||
Please include any relevant log snippets or files. If it works under one configuration but not under another, please provide logs for both configurations and their corresponding outputs so it is easy to see where behavior changes.
|
||||
|
||||
Also, please try to **avoid using screenshots** if at all possible. Instead, copy/paste the console output and use [Github's markdown](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax) to cleanly format your logs for easy readability.
|
||||
|
||||
Example environment info:
|
||||
```
|
||||
llama.cpp$ git log | head -1
|
||||
commit 2af23d30434a677c6416812eea52ccc0af65119c
|
||||
|
||||
llama.cpp$ lscpu | egrep "AMD|Flags"
|
||||
Vendor ID: AuthenticAMD
|
||||
Model name: AMD Ryzen Threadripper 1950X 16-Core Processor
|
||||
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid amd_dcm aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb hw_pstate ssbd ibpb vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt sha_ni xsaveopt xsavec xgetbv1 xsaves clzero irperf xsaveerptr arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif overflow_recov succor smca sme sev
|
||||
Virtualization: AMD-V
|
||||
|
||||
llama.cpp$ python3 --version
|
||||
Python 3.10.9
|
||||
|
||||
llama.cpp$ pip list | egrep "torch|numpy|sentencepiece"
|
||||
numpy 1.24.2
|
||||
numpydoc 1.5.0
|
||||
sentencepiece 0.1.97
|
||||
torch 1.13.1
|
||||
torchvision 0.14.1
|
||||
|
||||
llama.cpp$ make --version | head -1
|
||||
GNU Make 4.3
|
||||
|
||||
$ md5sum ./models/65B/ggml-model-q4_0.bin
|
||||
dbdd682cce80e2d6e93cefc7449df487 ./models/65B/ggml-model-q4_0.bin
|
||||
```
|
||||
|
||||
Example run with the Linux command [perf](https://www.brendangregg.com/perf.html)
|
||||
```
|
||||
llama.cpp$ perf stat ./main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p "Please close your issue when it has been answered."
|
||||
main: seed = 1679149377
|
||||
llama_model_load: loading model from './models/65B/ggml-model-q4_0.bin' - please wait ...
|
||||
llama_model_load: n_vocab = 32000
|
||||
llama_model_load: n_ctx = 512
|
||||
llama_model_load: n_embd = 8192
|
||||
llama_model_load: n_mult = 256
|
||||
llama_model_load: n_head = 64
|
||||
llama_model_load: n_layer = 80
|
||||
llama_model_load: n_rot = 128
|
||||
llama_model_load: f16 = 2
|
||||
llama_model_load: n_ff = 22016
|
||||
llama_model_load: n_parts = 8
|
||||
llama_model_load: ggml ctx size = 41477.73 MB
|
||||
llama_model_load: memory_size = 2560.00 MB, n_mem = 40960
|
||||
llama_model_load: loading model part 1/8 from './models/65B/ggml-model-q4_0.bin'
|
||||
llama_model_load: .......................................................................................... done
|
||||
llama_model_load: model size = 4869.09 MB / num tensors = 723
|
||||
llama_model_load: loading model part 2/8 from './models/65B/ggml-model-q4_0.bin.1'
|
||||
llama_model_load: .......................................................................................... done
|
||||
llama_model_load: model size = 4869.09 MB / num tensors = 723
|
||||
llama_model_load: loading model part 3/8 from './models/65B/ggml-model-q4_0.bin.2'
|
||||
llama_model_load: .......................................................................................... done
|
||||
llama_model_load: model size = 4869.09 MB / num tensors = 723
|
||||
llama_model_load: loading model part 4/8 from './models/65B/ggml-model-q4_0.bin.3'
|
||||
llama_model_load: .......................................................................................... done
|
||||
llama_model_load: model size = 4869.09 MB / num tensors = 723
|
||||
llama_model_load: loading model part 5/8 from './models/65B/ggml-model-q4_0.bin.4'
|
||||
llama_model_load: .......................................................................................... done
|
||||
llama_model_load: model size = 4869.09 MB / num tensors = 723
|
||||
llama_model_load: loading model part 6/8 from './models/65B/ggml-model-q4_0.bin.5'
|
||||
llama_model_load: .......................................................................................... done
|
||||
llama_model_load: model size = 4869.09 MB / num tensors = 723
|
||||
llama_model_load: loading model part 7/8 from './models/65B/ggml-model-q4_0.bin.6'
|
||||
llama_model_load: .......................................................................................... done
|
||||
llama_model_load: model size = 4869.09 MB / num tensors = 723
|
||||
llama_model_load: loading model part 8/8 from './models/65B/ggml-model-q4_0.bin.7'
|
||||
llama_model_load: .......................................................................................... done
|
||||
llama_model_load: model size = 4869.09 MB / num tensors = 723
|
||||
|
||||
system_info: n_threads = 16 / 32 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 |
|
||||
|
||||
main: prompt: 'Please close your issue when it has been answered.'
|
||||
main: number of tokens in prompt = 11
|
||||
1 -> ''
|
||||
12148 -> 'Please'
|
||||
3802 -> ' close'
|
||||
596 -> ' your'
|
||||
2228 -> ' issue'
|
||||
746 -> ' when'
|
||||
372 -> ' it'
|
||||
756 -> ' has'
|
||||
1063 -> ' been'
|
||||
7699 -> ' answered'
|
||||
29889 -> '.'
|
||||
|
||||
sampling parameters: temp = 0.800000, top_k = 40, top_p = 0.950000, repeat_last_n = 64, repeat_penalty = 1.300000
|
||||
|
||||
|
||||
Please close your issue when it has been answered.
|
||||
@duncan-donut: I'm trying to figure out what kind of "support" you need for this script and why, exactly? Is there a question about how the code works that hasn't already been addressed in one or more comments below this ticket, or are we talking something else entirely like some sorta bugfixing job because your server setup is different from mine??
|
||||
I can understand if your site needs to be running smoothly and you need help with a fix of sorts but there should really be nothing wrong here that the code itself could not handle. And given that I'm getting reports about how it works perfectly well on some other servers, what exactly are we talking? A detailed report will do wonders in helping us get this resolved for ya quickly so please take your time and describe the issue(s) you see as clearly & concisely as possible!!
|
||||
@duncan-donut: I'm not sure if you have access to cPanel but you could try these instructions. It is worth a shot! Let me know how it goes (or what error message, exactly!) when/if ya give that code a go? [end of text]
|
||||
|
||||
|
||||
main: mem per token = 71159620 bytes
|
||||
main: load time = 19309.95 ms
|
||||
main: sample time = 168.62 ms
|
||||
main: predict time = 223895.61 ms / 888.47 ms per token
|
||||
main: total time = 246406.42 ms
|
||||
|
||||
Performance counter stats for './main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p Please close your issue when it has been answered.':
|
||||
|
||||
3636882.89 msec task-clock # 14.677 CPUs utilized
|
||||
13509 context-switches # 3.714 /sec
|
||||
2436 cpu-migrations # 0.670 /sec
|
||||
10476679 page-faults # 2.881 K/sec
|
||||
13133115082869 cycles # 3.611 GHz (16.77%)
|
||||
29314462753 stalled-cycles-frontend # 0.22% frontend cycles idle (16.76%)
|
||||
10294402631459 stalled-cycles-backend # 78.39% backend cycles idle (16.74%)
|
||||
23479217109614 instructions # 1.79 insn per cycle
|
||||
# 0.44 stalled cycles per insn (16.76%)
|
||||
2353072268027 branches # 647.002 M/sec (16.77%)
|
||||
1998682780 branch-misses # 0.08% of all branches (16.76%)
|
||||
|
||||
247.802177522 seconds time elapsed
|
||||
|
||||
3618.573072000 seconds user
|
||||
18.491698000 seconds sys
|
||||
```
|
||||
@ -0,0 +1,20 @@
|
||||
700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth
|
||||
7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json
|
||||
745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth
|
||||
d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth
|
||||
4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json
|
||||
e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth
|
||||
4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth
|
||||
24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378 models/30B/consolidated.02.pth
|
||||
1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b models/30B/consolidated.03.pth
|
||||
2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb models/30B/params.json
|
||||
135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe models/65B/consolidated.00.pth
|
||||
9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde models/65B/consolidated.01.pth
|
||||
e7babf7c5606f165a3756f527cb0fedc4f83e67ef1290391e52fb1cce5f26770 models/65B/consolidated.02.pth
|
||||
73176ffb426b40482f2aa67ae1217ef79fbbd1fff5482bae5060cdc5a24ab70e models/65B/consolidated.03.pth
|
||||
882e6431d0b08a8bc66261a0d3607da21cbaeafa96a24e7e59777632dbdac225 models/65B/consolidated.04.pth
|
||||
a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78 models/65B/consolidated.05.pth
|
||||
72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b models/65B/consolidated.06.pth
|
||||
d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638 models/65B/consolidated.07.pth
|
||||
999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b models/65B/params.json
|
||||
9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 models/tokenizer.model
|
||||
@ -0,0 +1,6 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Temporary script - will be removed in the future
|
||||
#
|
||||
|
||||
./main -m ./models/7B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
|
||||
@ -0,0 +1,167 @@
|
||||
# Convert a GPTQ quantized LLaMA model to a ggml compatible file
|
||||
# Based on: https://github.com/qwopqwop200/GPTQ-for-LLaMa
|
||||
#
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import json
|
||||
import struct
|
||||
import numpy as np
|
||||
import torch
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
|
||||
if len(sys.argv) != 4:
|
||||
print("Usage: convert-gptq-to-ggml.py llamaXXb-4bit.pt tokenizer.model out.bin\n")
|
||||
sys.exit(1)
|
||||
|
||||
fname_model = sys.argv[1]
|
||||
fname_tokenizer = sys.argv[2]
|
||||
dir_out = sys.argv[3]
|
||||
|
||||
model = torch.load(fname_model, map_location="cpu")
|
||||
|
||||
n_vocab, n_embd = model['model.embed_tokens.weight'].shape
|
||||
n_layer = 1 + max(int(m.group(1)) for name in model
|
||||
if (m := re.match(r'model\.layers\.([0-9]+)', name)))
|
||||
|
||||
# hardcoded:
|
||||
n_mult = 256
|
||||
n_head = {32: 32, 40: 40, 60: 52, 80: 64}[n_layer]
|
||||
|
||||
tokenizer = SentencePieceProcessor(fname_tokenizer)
|
||||
|
||||
assert tokenizer.vocab_size() == n_vocab
|
||||
|
||||
fname_out = sys.argv[3]
|
||||
|
||||
fout = open(fname_out, "wb")
|
||||
|
||||
fout.write(struct.pack("i", 0x67676d66)) # magic: ggmf in hex
|
||||
fout.write(struct.pack("i", 1)) # file version
|
||||
fout.write(struct.pack("i", n_vocab))
|
||||
fout.write(struct.pack("i", n_embd))
|
||||
fout.write(struct.pack("i", n_mult))
|
||||
fout.write(struct.pack("i", n_head))
|
||||
fout.write(struct.pack("i", n_layer))
|
||||
fout.write(struct.pack("i", n_embd // n_head)) # rot (obsolete)
|
||||
fout.write(struct.pack("i", 4))
|
||||
|
||||
|
||||
# This loop unchanged from convert-pth-to-ggml.py:
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
if tokenizer.is_unknown(i):
|
||||
text = " \u2047 ".encode("utf-8")
|
||||
elif tokenizer.is_control(i):
|
||||
text = b""
|
||||
elif tokenizer.is_byte(i):
|
||||
piece = tokenizer.id_to_piece(i)
|
||||
if len(piece) != 6:
|
||||
print(f"Invalid token: {piece}")
|
||||
sys.exit(1)
|
||||
byte_value = int(piece[3:-1], 16)
|
||||
text = struct.pack("B", byte_value)
|
||||
else:
|
||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
|
||||
fout.write(struct.pack("i", len(text)))
|
||||
fout.write(text)
|
||||
fout.write(struct.pack("f", tokenizer.get_score(i)))
|
||||
|
||||
def write_header(shape, dst_name, ftype_cur):
|
||||
sname = dst_name.encode('utf-8')
|
||||
fout.write(struct.pack("iii", len(shape), len(sname), ftype_cur))
|
||||
fout.write(struct.pack("i" * len(shape), *shape[::-1]))
|
||||
fout.write(sname)
|
||||
|
||||
def convert_non_q4(src_name, dst_name):
|
||||
v = model[src_name]
|
||||
shape = v.shape
|
||||
print("Processing non-Q4 variable: " + src_name + " with shape: ", shape, " and type: ", v.dtype)
|
||||
if len(shape) == 1:
|
||||
print(" Converting to float32")
|
||||
v = v.to(torch.float32)
|
||||
|
||||
ftype_cur = {torch.float16: 1, torch.float32: 0}[v.dtype]
|
||||
|
||||
# header
|
||||
write_header(shape, dst_name, ftype_cur)
|
||||
|
||||
# data
|
||||
v.numpy().tofile(fout)
|
||||
|
||||
def convert_q4(src_name, dst_name, permute=False):
|
||||
zeros = model[f"{src_name}.zeros"].numpy()
|
||||
scales = model[f"{src_name}.scales"].numpy()
|
||||
bias = model[f"{src_name}.bias"].numpy()
|
||||
qweight = model[f"{src_name}.qweight"].numpy().T # transpose
|
||||
|
||||
# Q4_1 does not support bias; good thing the bias is always all zeros.
|
||||
assert not np.any(bias)
|
||||
|
||||
# Each int32 item is actually 8 int4 items packed together, and it's transposed.
|
||||
shape = (qweight.shape[0], qweight.shape[1] * 8)
|
||||
|
||||
print("Processing Q4 variable: " + src_name + " with shape: ", shape)
|
||||
|
||||
# The output format has the int4 weights in groups of 32 rather than 8.
|
||||
# It looks like this:
|
||||
# For each row:
|
||||
# For each group of 32 columns:
|
||||
# - addend (float32, 4 bytes)
|
||||
# - scale (float32, 4 bytes)
|
||||
# - weights (int4 * 32, 16 bytes)
|
||||
# Note that in the input, the scales and addends are shared between all
|
||||
# the columns in a row, so we end up wasting quite a bit of memory with
|
||||
# repeated scales and addends.
|
||||
|
||||
addends = -zeros # flip sign
|
||||
|
||||
# Since the output format is mixed between integers and floats, we have
|
||||
# to hackily view the floats as int32s just so numpy will let us
|
||||
# concatenate them.
|
||||
addends_view = addends.view(dtype=np.int32)
|
||||
scales_view = scales.view(dtype=np.int32)
|
||||
|
||||
# Split into groups of 4 columns (i.e. 32 columns of quantized data):
|
||||
grouped = qweight.reshape([qweight.shape[0], qweight.shape[1] // 4, 4])
|
||||
|
||||
# Repeat addends and scales:
|
||||
addends_rep = np.atleast_3d(addends_view).repeat(grouped.shape[1], axis=1)
|
||||
scales_rep = np.atleast_3d(scales_view).repeat(grouped.shape[1], axis=1)
|
||||
|
||||
blob = np.concatenate([scales_rep, addends_rep, grouped], axis=2, casting='no')
|
||||
|
||||
if permute:
|
||||
# Permute some rows to undo the permutation done by convert_llama_weights_to_hf.py.
|
||||
# This can be done after the above conversion because it doesn't affect column order/layout.
|
||||
blob = (blob.reshape(n_head, 2, shape[0] // n_head // 2, *blob.shape[1:])
|
||||
.swapaxes(1, 2)
|
||||
.reshape(blob.shape))
|
||||
|
||||
# header
|
||||
write_header(shape, dst_name, 3) # ftype = Q4_1
|
||||
|
||||
# data
|
||||
blob.tofile(fout)
|
||||
|
||||
convert_non_q4("model.embed_tokens.weight", "tok_embeddings.weight")
|
||||
convert_non_q4("model.norm.weight", "norm.weight")
|
||||
convert_non_q4("lm_head.weight", "output.weight")
|
||||
|
||||
for i in range(n_layer):
|
||||
convert_q4(f"model.layers.{i}.self_attn.q_proj", f"layers.{i}.attention.wq.weight", permute=True)
|
||||
convert_q4(f"model.layers.{i}.self_attn.k_proj", f"layers.{i}.attention.wk.weight", permute=True)
|
||||
convert_q4(f"model.layers.{i}.self_attn.v_proj", f"layers.{i}.attention.wv.weight")
|
||||
convert_q4(f"model.layers.{i}.self_attn.o_proj", f"layers.{i}.attention.wo.weight")
|
||||
|
||||
convert_q4(f"model.layers.{i}.mlp.gate_proj", f"layers.{i}.feed_forward.w1.weight")
|
||||
convert_q4(f"model.layers.{i}.mlp.down_proj", f"layers.{i}.feed_forward.w2.weight")
|
||||
convert_q4(f"model.layers.{i}.mlp.up_proj", f"layers.{i}.feed_forward.w3.weight")
|
||||
|
||||
convert_non_q4(f"model.layers.{i}.input_layernorm.weight", f"layers.{i}.attention_norm.weight")
|
||||
convert_non_q4(f"model.layers.{i}.post_attention_layernorm.weight", f"layers.{i}.ffn_norm.weight")
|
||||
|
||||
|
||||
fout.close()
|
||||
|
||||
print("Done. Output file: " + fname_out)
|
||||
print("")
|
||||
@ -1,66 +0,0 @@
|
||||
import os
|
||||
import sys
|
||||
from tqdm import tqdm
|
||||
import requests
|
||||
|
||||
if len(sys.argv) < 3:
|
||||
print("Usage: download-pth.py dir-model model-type\n")
|
||||
print(" model-type: Available models 7B, 13B, 30B or 65B")
|
||||
sys.exit(1)
|
||||
|
||||
modelsDir = sys.argv[1]
|
||||
model = sys.argv[2]
|
||||
|
||||
num = {
|
||||
"7B": 1,
|
||||
"13B": 2,
|
||||
"30B": 4,
|
||||
"65B": 8,
|
||||
}
|
||||
|
||||
if model not in num:
|
||||
print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"Downloading model {model}")
|
||||
|
||||
files = ["checklist.chk", "params.json"]
|
||||
|
||||
for i in range(num[model]):
|
||||
files.append(f"consolidated.0{i}.pth")
|
||||
|
||||
resolved_path = os.path.abspath(os.path.join(modelsDir, model))
|
||||
os.makedirs(resolved_path, exist_ok=True)
|
||||
|
||||
for file in files:
|
||||
dest_path = os.path.join(resolved_path, file)
|
||||
|
||||
if os.path.exists(dest_path):
|
||||
print(f"Skip file download, it already exists: {file}")
|
||||
continue
|
||||
|
||||
url = f"https://agi.gpt4.org/llama/LLaMA/{model}/{file}"
|
||||
response = requests.get(url, stream=True)
|
||||
with open(dest_path, 'wb') as f:
|
||||
with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
|
||||
for chunk in response.iter_content(chunk_size=1024):
|
||||
if chunk:
|
||||
f.write(chunk)
|
||||
t.update(len(chunk))
|
||||
|
||||
files2 = ["tokenizer_checklist.chk", "tokenizer.model"]
|
||||
for file in files2:
|
||||
dest_path = os.path.join(modelsDir, file)
|
||||
|
||||
if os.path.exists(dest_path):
|
||||
print(f"Skip file download, it already exists: {file}")
|
||||
continue
|
||||
|
||||
url = f"https://agi.gpt4.org/llama/LLaMA/{file}"
|
||||
response = requests.get(url, stream=True)
|
||||
with open(dest_path, 'wb') as f:
|
||||
with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
|
||||
for chunk in response.iter_content(chunk_size=1024):
|
||||
if chunk:
|
||||
f.write(chunk)
|
||||
t.update(len(chunk))
|
||||
@ -0,0 +1,139 @@
|
||||
#ifndef LLAMA_H
|
||||
#define LLAMA_H
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#ifdef LLAMA_SHARED
|
||||
# ifdef _WIN32
|
||||
# ifdef LLAMA_BUILD
|
||||
# define LLAMA_API __declspec(dllexport)
|
||||
# else
|
||||
# define LLAMA_API __declspec(dllimport)
|
||||
# endif
|
||||
# else
|
||||
# define LLAMA_API __attribute__ ((visibility ("default")))
|
||||
# endif
|
||||
#else
|
||||
# define LLAMA_API
|
||||
#endif
|
||||
|
||||
#define LLAMA_FILE_VERSION 1
|
||||
#define LLAMA_FILE_MAGIC 0x67676d66 // 'ggmf' in hex
|
||||
#define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//
|
||||
// C interface
|
||||
//
|
||||
// TODO: show sample usage
|
||||
//
|
||||
|
||||
struct llama_context;
|
||||
|
||||
typedef int llama_token;
|
||||
|
||||
typedef struct llama_token_data {
|
||||
llama_token id; // token id
|
||||
|
||||
float p; // probability of the token
|
||||
float plog; // log probability of the token
|
||||
|
||||
} llama_token_data;
|
||||
|
||||
struct llama_context_params {
|
||||
int n_ctx; // text context
|
||||
int n_parts; // -1 for default
|
||||
int seed; // RNG seed, 0 for random
|
||||
|
||||
bool f16_kv; // use fp16 for KV cache
|
||||
bool logits_all; // the llama_eval() call computes all logits, not just the last one
|
||||
bool vocab_only; // only load the vocabulary, no weights
|
||||
};
|
||||
|
||||
LLAMA_API struct llama_context_params llama_context_default_params();
|
||||
|
||||
// Various functions for loading a ggml llama model.
|
||||
// Allocate (almost) all memory needed for the model.
|
||||
// Return NULL on failure
|
||||
LLAMA_API struct llama_context * llama_init_from_file(
|
||||
const char * path_model,
|
||||
struct llama_context_params params);
|
||||
|
||||
// Frees all allocated memory
|
||||
LLAMA_API void llama_free(struct llama_context * ctx);
|
||||
|
||||
// TODO: not great API - very likely to change
|
||||
// Returns 0 on success
|
||||
LLAMA_API int llama_model_quantize(
|
||||
const char * fname_inp,
|
||||
const char * fname_out,
|
||||
int itype,
|
||||
int qk);
|
||||
|
||||
// Run the llama inference to obtain the logits and probabilities for the next token.
|
||||
// tokens + n_tokens is the provided batch of new tokens to process
|
||||
// n_past is the number of tokens to use from previous eval calls
|
||||
// Returns 0 on success
|
||||
LLAMA_API int llama_eval(
|
||||
struct llama_context * ctx,
|
||||
const llama_token * tokens,
|
||||
int n_tokens,
|
||||
int n_past,
|
||||
int n_threads);
|
||||
|
||||
// Convert the provided text into tokens.
|
||||
// The tokens pointer must be large enough to hold the resulting tokens.
|
||||
// Returns the number of tokens on success, no more than n_max_tokens
|
||||
// Returns a negative number on failure - the number of tokens that would have been returned
|
||||
// TODO: not sure if correct
|
||||
LLAMA_API int llama_tokenize(
|
||||
struct llama_context * ctx,
|
||||
const char * text,
|
||||
llama_token * tokens,
|
||||
int n_max_tokens,
|
||||
bool add_bos);
|
||||
|
||||
LLAMA_API int llama_n_vocab(struct llama_context * ctx);
|
||||
LLAMA_API int llama_n_ctx (struct llama_context * ctx);
|
||||
|
||||
// Token logits obtained from the last call to llama_eval()
|
||||
// The logits for the last token are stored in the last row
|
||||
// Can be mutated in order to change the probabilities of the next token
|
||||
// Rows: n_tokens
|
||||
// Cols: n_vocab
|
||||
LLAMA_API float * llama_get_logits(struct llama_context * ctx);
|
||||
|
||||
// Token Id -> String. Uses the vocabulary in the provided context
|
||||
LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);
|
||||
|
||||
// Special tokens
|
||||
LLAMA_API llama_token llama_token_bos();
|
||||
LLAMA_API llama_token llama_token_eos();
|
||||
|
||||
// TODO: improve the last_n_tokens interface ?
|
||||
LLAMA_API llama_token llama_sample_top_p_top_k(
|
||||
llama_context * ctx,
|
||||
const llama_token * last_n_tokens_data,
|
||||
int last_n_tokens_size,
|
||||
int top_k,
|
||||
double top_p,
|
||||
double temp,
|
||||
double repeat_penalty);
|
||||
|
||||
// Performance information
|
||||
LLAMA_API void llama_print_timings(struct llama_context * ctx);
|
||||
LLAMA_API void llama_reset_timings(struct llama_context * ctx);
|
||||
|
||||
// Print system information
|
||||
LLAMA_API const char * llama_print_system_info(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
Binary file not shown.
@ -0,0 +1,9 @@
|
||||
function(llama_add_test source)
|
||||
get_filename_component(TEST_TARGET ${source} NAME_WE)
|
||||
add_executable(${TEST_TARGET} ${source})
|
||||
target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils)
|
||||
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
|
||||
endfunction()
|
||||
|
||||
llama_add_test(test-quantize.c)
|
||||
llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
|
||||
@ -0,0 +1,42 @@
|
||||
#include "ggml.h"
|
||||
#undef NDEBUG
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
|
||||
int main(void) {
|
||||
#define QK 32
|
||||
float src[QK];
|
||||
uint8_t dst[24];
|
||||
int64_t hist[16];
|
||||
|
||||
for (int i = 0; i < QK; i++) {
|
||||
src[i] = (float)(i + 1);
|
||||
}
|
||||
|
||||
size_t size = ggml_quantize_q4_0(src, dst, QK, QK, QK, hist);
|
||||
assert(size == 20);
|
||||
float max_result = ((float *)dst)[0];
|
||||
float max_expected = src[31] / ((1 << 3) - 1);
|
||||
assert(max_result == max_expected);
|
||||
for (int i = 0; i < QK; i++) {
|
||||
uint8_t q4_result = (i % 2) ? (dst[sizeof(float) + i/2] >> 4) : (dst[sizeof(float) + i/2] & 0xF);
|
||||
uint8_t q4_expected = roundf(src[i] / max_expected) + 8;
|
||||
assert(q4_result == q4_expected);
|
||||
}
|
||||
|
||||
size = ggml_quantize_q4_1(src, dst, QK, QK, QK, hist);
|
||||
assert(size == 24);
|
||||
float delta_result = ((float *)dst)[0];
|
||||
float delta_expected = (src[31] - src[0]) / ((1 << 4) - 1);
|
||||
assert(delta_result == delta_expected);
|
||||
float min_result = ((float *)dst)[1];
|
||||
float min_expected = src[0];
|
||||
assert(min_result == min_expected);
|
||||
for (int i = 0; i < QK; i++) {
|
||||
uint8_t q4_result = (i % 2) ? (dst[sizeof(float)*2 + i/2] >> 4) : (dst[sizeof(float)*2 + i/2] & 0xF);
|
||||
uint8_t q4_expected = roundf((src[i] - min_expected) / delta_expected);
|
||||
assert(q4_result == q4_expected);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -0,0 +1,79 @@
|
||||
#include "utils.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <map>
|
||||
|
||||
static const std::map<std::string, std::vector<llama_token>> k_tests = {
|
||||
{ "Hello World", { 1, 10994, 2787, }, },
|
||||
{ " Hello World", { 1, 15043, 2787, }, },
|
||||
{ " Hello World!", { 1, 15043, 2787, 29991, }, },
|
||||
{ " this is 🦙.cpp", { 1, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
|
||||
{ "w048 7tuijk dsdfhu", { 1, 29893, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
|
||||
{ "нещо на Български", { 1, 821, 4851, 665, 1386, 29713, 1305, }, },
|
||||
};
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const std::string fname = argv[1];
|
||||
|
||||
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
|
||||
|
||||
llama_context * ctx;
|
||||
|
||||
// load the vocab
|
||||
{
|
||||
auto lparams = llama_context_default_params();
|
||||
|
||||
lparams.vocab_only = true;
|
||||
|
||||
ctx = llama_init_from_file(fname.c_str(), lparams);
|
||||
|
||||
if (ctx == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
const int n_vocab = llama_n_vocab(ctx);
|
||||
|
||||
if (n_vocab != 32000) {
|
||||
fprintf(stderr, "%s : expected 32000 tokens, got %d\n", __func__, n_vocab);
|
||||
return 2;
|
||||
}
|
||||
|
||||
for (const auto & test_kv : k_tests) {
|
||||
const auto res = ::llama_tokenize(ctx, test_kv.first, true);
|
||||
|
||||
bool correct = res.size() == test_kv.second.size();
|
||||
|
||||
for (int i = 0; i < (int) res.size() && correct; ++i) {
|
||||
if (res[i] != test_kv.second[i]) {
|
||||
correct = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (!correct) {
|
||||
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
|
||||
fprintf(stderr, "%s : expected tokens: ", __func__);
|
||||
for (const auto & t : test_kv.second) {
|
||||
fprintf(stderr, "%6d, ", t);
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s : got tokens: ", __func__);
|
||||
for (const auto & t : res) {
|
||||
fprintf(stderr, "%6d, ", t);
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
return 3;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
Loading…
Reference in New Issue