Compare commits

...

8 Commits

Author | SHA1 | Message | Date

Georgi Gerganov | b6b268d441 | Add link to Roadmap discussion | 3 years ago

Georgi Gerganov | 3cd8dde0d1 | Revert "Fix memory allocation issues and seg faults" | 3 years ago
    This reverts commit 4870e455b3.
    Will provide the correct fix later

Georgi Gerganov | 4870e455b3 | Fix memory allocation issues and seg faults | 3 years ago

Georgi Gerganov | 483bab2e3d | Avoid the transposed X branch in the Z = X * Y matrix multiplication (#439) | 3 years ago
    Should make results reproducible for different number of threads and batch sizes

Jed Fox | 404e1da38e | Fix quantize script not finding models in parent directory (#428) | 3 years ago

Georgi Gerganov | 4cc053b6d5 | Remove obsolete command from Docker script | 3 years ago

Georgi Gerganov | 0ba5a3a9a5 | Obsolete | 3 years ago
rabidcopy | 2e17dfd80a | Replace EOS with newline to prevent context/memory being flushed by EOS in interactive mode (#333)
* Improve interactive mode's coherence after EOS

Aims to improve coherence and ability to resume the interactive session when the user is given input back after an end of text token is reached.
Not sure what token 13 is or why it seems to help. See conversation for examples.

* Make newline token a constant

* dynamically determine newline token

* relocate previous newline token const

* cleanup whitespace

* print a new line on end of text in interactive

this may need to be looked into further when not using a reverse prompt

* only print manual newline with reverse prompt

fix formatting of reverse prompts so they don't end up at the end of the current line while not introducing unnecessary new lines otherwise

* alternate approach to replace end of text tokens

* Inject the reverse prompt again after eos in interactive mode

* tokenize reverse prompt when needed

makes this PR compatible with https://github.com/ggerganov/llama.cpp/pull/330

* tokenize and inject only first reverse prompt

thanks to tjohnman

* tokenize first reverse prompt once

* add newline token

* add newline token

* tokenize/inject reverse prompt for refactor

this doesn't seem right though

* tokenize nothing for antiprompt if no reverse

* Update main.cpp

* Update main.cpp

* tokenize and inject reverse prompt as needed

this doesn't seem to work if the reverse prompt is tokenized outside earlier on

* not needed

* remove newline token

* remove newline token

* tokenize newline token

* add space to comment

* Update main.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Slaren <2141330+slaren@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
3 years ago
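
The mechanism behind #333, in brief: when the sampler emits the end-of-text token in interactive mode, main.cpp now substitutes the newline token (the "token 13" mentioned above appears to be simply what "\n" tokenizes to in LLaMA's vocabulary) and queues the first reverse prompt, so control returns to the user instead of the context being flushed. Below is a minimal, self-contained C++ sketch of that control flow; the token ids and the "User:" tokens are assumed/hypothetical stand-ins rather than output of a real tokenizer, and the actual change is in the main.cpp diff further down.

// Minimal sketch (not the real llama.cpp code) of the control flow added by #333.
// Plain ints stand in for llama tokens; the ids below are assumed/hypothetical.
#include <cstdio>
#include <vector>

int main() {
    const int eos_token = 2;                                  // assumed end-of-text id
    const std::vector<int> newline_token = {13};              // assumed id for "\n"
    const std::vector<int> first_antiprompt = {4911, 29901};  // hypothetical ids for "User:"
    const bool interactive = true;

    std::vector<int> embd_inp;   // queued input tokens, as in main.cpp
    std::vector<int> embd;       // tokens about to be fed to the model

    int id = eos_token;          // pretend the sampler just produced end-of-text

    // replace end of text token with newline token when in interactive mode
    if (id == eos_token && interactive) {
        id = newline_token.front();
        if (!first_antiprompt.empty()) {
            // inject the first reverse prompt so generation stops for user input
            embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
        }
    }

    embd.push_back(id);          // the newline, not EOS, enters the context
    std::printf("next token: %d, queued input tokens: %zu\n", id, embd_inp.size());
    return 0;
}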

@@ -16,11 +16,7 @@ elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
     ./quantize $arg2
 elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
     ./main $arg2
-elif [[ $arg1 == '--download' || $arg1 == '-d' ]]; then
-    python3 ./download-pth.py $arg2
 elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
-    echo "Downloading model..."
-    python3 ./download-pth.py "$1" "$2"
     echo "Converting PTH to GGML..."
     for i in `ls $1/$2/ggml-model-f16.bin*`; do
         if [ -f "${i/f16/q4_0}" ]; then
@@ -39,8 +35,6 @@ else
     echo " ex: \"/models/7B/\" 1"
     echo " --quantize (-q): Optimize with quantization process ggml"
     echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
-    echo " --download (-d): Download original llama model from CDN: https://agi.gpt4.org/llama/"
-    echo " ex: \"/models/\" 7B"
-    echo " --all-in-one (-a): Execute --download, --convert & --quantize"
+    echo " --all-in-one (-a): Execute --convert & --quantize"
     echo " ex: \"/models/\" 7B"
 fi

@@ -7,8 +7,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 **Hot topics:**
+- [Roadmap (short-term)](https://github.com/ggerganov/llama.cpp/discussions/457)
 - New C-style API is now available: https://github.com/ggerganov/llama.cpp/pull/370
-- [Added Alpaca support](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
 - Cache input prompts for faster initialization: https://github.com/ggerganov/llama.cpp/issues/64
 - Create a `llama.cpp` logo: https://github.com/ggerganov/llama.cpp/issues/105

@@ -1,66 +0,0 @@
-import os
-import sys
-from tqdm import tqdm
-import requests
-
-if len(sys.argv) < 3:
-    print("Usage: download-pth.py dir-model model-type\n")
-    print(" model-type: Available models 7B, 13B, 30B or 65B")
-    sys.exit(1)
-
-modelsDir = sys.argv[1]
-model = sys.argv[2]
-
-num = {
-    "7B": 1,
-    "13B": 2,
-    "30B": 4,
-    "65B": 8,
-}
-
-if model not in num:
-    print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B")
-    sys.exit(1)
-
-print(f"Downloading model {model}")
-
-files = ["checklist.chk", "params.json"]
-
-for i in range(num[model]):
-    files.append(f"consolidated.0{i}.pth")
-
-resolved_path = os.path.abspath(os.path.join(modelsDir, model))
-os.makedirs(resolved_path, exist_ok=True)
-
-for file in files:
-    dest_path = os.path.join(resolved_path, file)
-
-    if os.path.exists(dest_path):
-        print(f"Skip file download, it already exists: {file}")
-        continue
-
-    url = f"https://agi.gpt4.org/llama/LLaMA/{model}/{file}"
-    response = requests.get(url, stream=True)
-    with open(dest_path, 'wb') as f:
-        with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
-            for chunk in response.iter_content(chunk_size=1024):
-                if chunk:
-                    f.write(chunk)
-                    t.update(len(chunk))
-
-files2 = ["tokenizer_checklist.chk", "tokenizer.model"]
-for file in files2:
-    dest_path = os.path.join(modelsDir, file)
-
-    if os.path.exists(dest_path):
-        print(f"Skip file download, it already exists: {file}")
-        continue
-
-    url = f"https://agi.gpt4.org/llama/LLaMA/{file}"
-    response = requests.get(url, stream=True)
-    with open(dest_path, 'wb') as f:
-        with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
-            for chunk in response.iter_content(chunk_size=1024):
-                if chunk:
-                    f.write(chunk)
-                    t.update(len(chunk))

@@ -727,11 +727,13 @@ static bool llama_eval_internal(
             // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
             struct ggml_tensor * V_trans =
-                ggml_permute(ctx0,
-                    ggml_reshape_3d(ctx0,
-                        ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-                        n_embd/n_head, n_head, n_past + N),
-                    1, 2, 0, 3);
+                ggml_cpy(ctx0,
+                    ggml_permute(ctx0,
+                        ggml_reshape_3d(ctx0,
+                            ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
+                            n_embd/n_head, n_head, n_past + N),
+                        1, 2, 0, 3),
+                    ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));

             // KQV = transpose(V) * KQ_soft_max
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
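
Some context for the reproducibility note on commit 483bab2e3d: copying the permuted V into a contiguous F32 tensor keeps ggml_mul_mat on its ordinary, non-transposed code path, so the products are accumulated in one fixed order regardless of how the work is split. Floating-point addition is not associative, so any path that regroups the accumulation, for example across a different number of threads or a different batch size, can change the low bits of the result. The small stand-alone program below (a toy illustration, not ggml code) shows the effect on a single dot product:

// Toy demonstration: regrouping a float accumulation changes the result slightly.
#include <cstdio>
#include <random>
#include <vector>

int main() {
    std::mt19937 rng(42);
    std::uniform_real_distribution<float> dist(-1.0f, 1.0f);

    std::vector<float> a(4096), b(4096);
    for (size_t i = 0; i < a.size(); ++i) { a[i] = dist(rng); b[i] = dist(rng); }

    // single pass, one accumulator (one fixed summation order)
    float seq = 0.0f;
    for (size_t i = 0; i < a.size(); ++i) {
        seq += a[i] * b[i];
    }

    // same data split into two halves, as two threads or two batches might do
    float lo = 0.0f, hi = 0.0f;
    for (size_t i = 0; i < a.size() / 2; ++i)        { lo += a[i] * b[i]; }
    for (size_t i = a.size() / 2; i < a.size(); ++i) { hi += a[i] * b[i]; }

    std::printf("sequential: %.9f\n", seq);
    std::printf("split:      %.9f\n", lo + hi);
    std::printf("difference: %.3g\n", (double) seq - (double) (lo + hi));
    return 0;
}

The two sums typically differ only in the last few bits, which is numerically harmless but enough to make otherwise identical runs sample different tokens, hence the fixed accumulation path.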

@@ -258,6 +258,9 @@ int main(int argc, char ** argv) {
         params.interactive = true;
     }

+    // determine newline token
+    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
+
     fprintf(stderr, "\n");
     fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
     fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
@@ -359,6 +362,16 @@ int main(int argc, char ** argv) {
                 last_n_tokens.push_back(id);
             }

+            // replace end of text token with newline token when in interactive mode
+            if (id == llama_token_eos() && params.interactive) {
+                id = llama_token_newline.front();
+                if (params.antiprompt.size() != 0) {
+                    // tokenize and inject first reverse prompt
+                    const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
+                    embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
+                }
+            }
+
             // add it to the context
             embd.push_back(id);
@@ -451,12 +464,8 @@ int main(int argc, char ** argv) {

         // end of text token
         if (embd.back() == llama_token_eos()) {
-            if (params.interactive) {
-                is_interacting = true;
-            } else {
-                fprintf(stderr, " [end of text]\n");
-                break;
-            }
+            fprintf(stderr, " [end of text]\n");
+            break;
         }

         // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.

@@ -57,6 +57,7 @@ def main():
     # )
     args = parser.parse_args()
+    args.models_path = os.path.abspath(args.models_path)

     if not os.path.isfile(args.quantize_script_path):
         print(
