Compare commits


No commits in common. 'master' and 'master-2e17dfd' have entirely different histories.

@@ -16,7 +16,11 @@ elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
     ./quantize $arg2
 elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
     ./main $arg2
+elif [[ $arg1 == '--download' || $arg1 == '-d' ]]; then
+    python3 ./download-pth.py $arg2
 elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
+    echo "Downloading model..."
+    python3 ./download-pth.py "$1" "$2"
     echo "Converting PTH to GGML..."
     for i in `ls $1/$2/ggml-model-f16.bin*`; do
         if [ -f "${i/f16/q4_0}" ]; then
@@ -35,6 +39,8 @@ else
     echo " ex: \"/models/7B/\" 1"
     echo " --quantize (-q): Optimize with quantization process ggml"
     echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
-    echo " --all-in-one (-a): Execute --convert & --quantize"
+    echo " --download (-d): Download original llama model from CDN: https://agi.gpt4.org/llama/"
+    echo " ex: \"/models/\" 7B"
+    echo " --all-in-one (-a): Execute --download, --convert & --quantize"
     echo " ex: \"/models/\" 7B"
 fi

@@ -7,8 +7,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 **Hot topics:**

-- [Roadmap (short-term)](https://github.com/ggerganov/llama.cpp/discussions/457)
 - New C-style API is now available: https://github.com/ggerganov/llama.cpp/pull/370
+- [Added Alpaca support](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
 - Cache input prompts for faster initialization: https://github.com/ggerganov/llama.cpp/issues/64
 - Create a `llama.cpp` logo: https://github.com/ggerganov/llama.cpp/issues/105

@@ -0,0 +1,66 @@
import os
import sys
from tqdm import tqdm
import requests
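
# Download the original LLaMA checkpoint files for one model size from the
# agi.gpt4.org mirror into the given models directory.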
if len(sys.argv) < 3:
    print("Usage: download-pth.py dir-model model-type\n")
    print("  model-type: Available models 7B, 13B, 30B or 65B")
    sys.exit(1)
modelsDir = sys.argv[1]
model = sys.argv[2]
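
# Number of consolidated.XX.pth weight shards published for each model size.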
num = {
    "7B": 1,
    "13B": 2,
    "30B": 4,
    "65B": 8,
}

if model not in num:
    print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B")
    sys.exit(1)
print(f"Downloading model {model}")
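
# Per-model files: the checksum list, the hyper-parameter file and one weight shard per part.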
files = ["checklist.chk", "params.json"]
for i in range(num[model]):
    files.append(f"consolidated.0{i}.pth")
resolved_path = os.path.abspath(os.path.join(modelsDir, model))
os.makedirs(resolved_path, exist_ok=True)
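
# Fetch each per-model file that is not already present, streaming in 1 KiB
# chunks so tqdm can show download progress.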
for file in files:
    dest_path = os.path.join(resolved_path, file)
    if os.path.exists(dest_path):
        print(f"Skip file download, it already exists: {file}")
        continue

    url = f"https://agi.gpt4.org/llama/LLaMA/{model}/{file}"
    response = requests.get(url, stream=True)
    with open(dest_path, 'wb') as f:
        with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    t.update(len(chunk))
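
# The tokenizer files are shared across model sizes and go directly into the models directory.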
files2 = ["tokenizer_checklist.chk", "tokenizer.model"]
for file in files2:
    dest_path = os.path.join(modelsDir, file)
    if os.path.exists(dest_path):
        print(f"Skip file download, it already exists: {file}")
        continue

    url = f"https://agi.gpt4.org/llama/LLaMA/{file}"
    response = requests.get(url, stream=True)
    with open(dest_path, 'wb') as f:
        with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    t.update(len(chunk))
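
For reference, the wrapper hunk above calls this script as `python3 ./download-pth.py <models-dir> <model-size>`; e.g. `python3 ./download-pth.py ./models/ 7B` would create `./models/7B/` containing checklist.chk, params.json and consolidated.00.pth, with the shared tokenizer files placed directly under `./models/`.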

@@ -727,13 +727,11 @@ static bool llama_eval_internal(

         // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
         struct ggml_tensor * V_trans =
-            ggml_cpy(ctx0,
-                ggml_permute(ctx0,
-                    ggml_reshape_3d(ctx0,
-                        ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-                        n_embd/n_head, n_head, n_past + N),
-                    1, 2, 0, 3),
-                ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
+            ggml_permute(ctx0,
+                ggml_reshape_3d(ctx0,
+                    ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
+                    n_embd/n_head, n_head, n_past + N),
+                1, 2, 0, 3);

         // KQV = transpose(V) * KQ_soft_max
         struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
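
The only change in this hunk is how V_trans is built: the removed lines copy the permuted view of the V cache into a fresh contiguous (n_past + N, n_embd/n_head, n_head) tensor via ggml_cpy, while this branch keeps the non-contiguous permuted view and lets ggml_mul_mat read it directly. A minimal NumPy sketch of the same reshape/permute (illustrative sizes only, not taken from the diff; ggml's ne0 is the innermost axis, so NumPy shapes read in reverse order):

import numpy as np

# Illustrative sizes only (assumed for the sketch).
n_embd, n_head, n_past, N = 8, 2, 3, 1
head_dim = n_embd // n_head

# The ggml_view_1d above selects (n_past + N) * n_embd cached V values for layer il.
memory_v = np.arange((n_past + N) * n_embd, dtype=np.float32)

# ggml_reshape_3d(..., n_embd/n_head, n_head, n_past + N): ne0 is fastest-varying,
# so the equivalent NumPy shape is written in reverse.
v = memory_v.reshape(n_past + N, n_head, head_dim)

# ggml_permute(..., 1, 2, 0, 3) gives ne = (n_past + N, n_embd/n_head, n_head),
# i.e. NumPy shape (n_head, head_dim, n_past + N); the removed ggml_cpy only
# materializes this layout contiguously, which np.ascontiguousarray mimics here.
V_trans = np.ascontiguousarray(v.transpose(1, 2, 0))
assert V_trans.shape == (n_head, head_dim, n_past + N)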

@@ -57,7 +57,6 @@ def main():
     # )
     args = parser.parse_args()

-    args.models_path = os.path.abspath(args.models_path)

     if not os.path.isfile(args.quantize_script_path):
         print(
