11 changed files with 118 additions and 150 deletions
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@ -16,7 +16,11 @@ elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
    ./quantize $arg2
 elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
    ./main $arg2
+elif [[ $arg1 == '--download' || $arg1 == '-d' ]]; then
+    python3 ./download-pth.py $arg2
 elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
+    echo "Downloading model..."
+    python3 ./download-pth.py "$1" "$2"
    echo "Converting PTH to GGML..."
    for i in `ls $1/$2/ggml-model-f16.bin*`; do
        if [ -f "${i/f16/q4_0}" ]; then
@ -35,6 +39,8 @@ else
    echo "              ex: \"/models/7B/\" 1"
    echo "  --quantize (-q): Optimize with quantization process ggml"
    echo "              ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
-    echo "  --all-in-one (-a): Execute --convert & --quantize"
+    echo "  --download (-d): Download original llama model from CDN: https://agi.gpt4.org/llama/"
+    echo "              ex: \"/models/\" 7B"
+    echo "  --all-in-one (-a): Execute --download, --convert & --quantize"
    echo "              ex: \"/models/\" 7B"
 fi
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -218,9 +218,6 @@ add_library(utils OBJECT
 target_include_directories(utils PUBLIC .)
 target_compile_features(utils PUBLIC cxx_std_11) # don't bump
 target_link_libraries(utils PRIVATE ${LLAMA_EXTRA_LIBS})
-if (BUILD_SHARED_LIBS)
-    set_target_properties(utils PROPERTIES POSITION_INDEPENDENT_CODE ON)
-endif()

 add_library(ggml OBJECT
            ggml.c
@ -229,9 +226,6 @@ add_library(ggml OBJECT
 target_include_directories(ggml PUBLIC .)
 target_compile_features(ggml PUBLIC c_std_11) # don't bump
 target_link_libraries(ggml PRIVATE Threads::Threads ${LLAMA_EXTRA_LIBS})
-if (BUILD_SHARED_LIBS)
-    set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
-endif()

 add_library(llama
            llama.cpp
@ -240,10 +234,6 @@ add_library(llama
 target_include_directories(llama PUBLIC .)
 target_compile_features(llama PUBLIC cxx_std_11) # don't bump
 target_link_libraries(llama PRIVATE utils ggml ${LLAMA_EXTRA_LIBS})
-if (BUILD_SHARED_LIBS)
-    set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
-    target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
-endif()

 #
 # Executables
--- a/README.md
+++ b/README.md
@ -7,8 +7,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

 **Hot topics:**

- [Roadmap (short-term)](https://github.com/ggerganov/llama.cpp/discussions/457)
 - New C-style API is now available: https://github.com/ggerganov/llama.cpp/pull/370
+- [Added Alpaca support](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
 - Cache input prompts for faster initialization: https://github.com/ggerganov/llama.cpp/issues/64
 - Create a `llama.cpp` logo: https://github.com/ggerganov/llama.cpp/issues/105

@ -220,7 +220,7 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
 ### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data

 * The LLaMA models are officially distributed by Facebook and will never be provided through this repository. See this [pull request in Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to obtain access to the model data.
-* Please verify the sha256 checksums of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files.
+* Please verify the sha256 checksums of all of your `consolidated*.pth` and corresponding converted `ggml-model-*.bin` model files to confirm that you have the correct model data files before creating an issue relating to your model files.
 * The following command will verify if you have all possible latest files in your self-installed `./models` subdirectory:

  `sha256sum --ignore-missing -c SHA256SUMS` on Linux
--- a/20
+++ b/20
@ -1,20 +0,0 @@
-700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d  models/7B/consolidated.00.pth
-7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265  models/7B/params.json
-745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08  models/13B/consolidated.00.pth
-d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085  models/13B/consolidated.01.pth
-4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f  models/13B/params.json
-e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067  models/30B/consolidated.00.pth
-4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff  models/30B/consolidated.01.pth
-24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378  models/30B/consolidated.02.pth
-1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b  models/30B/consolidated.03.pth
-2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb  models/30B/params.json
-135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe  models/65B/consolidated.00.pth
-9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde  models/65B/consolidated.01.pth
-e7babf7c5606f165a3756f527cb0fedc4f83e67ef1290391e52fb1cce5f26770  models/65B/consolidated.02.pth
-73176ffb426b40482f2aa67ae1217ef79fbbd1fff5482bae5060cdc5a24ab70e  models/65B/consolidated.03.pth
-882e6431d0b08a8bc66261a0d3607da21cbaeafa96a24e7e59777632dbdac225  models/65B/consolidated.04.pth
-a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78  models/65B/consolidated.05.pth
-72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b  models/65B/consolidated.06.pth
-d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638  models/65B/consolidated.07.pth
-999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b  models/65B/params.json
-9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347  models/tokenizer.model
--- a/convert-gptq-to-ggml.py
+++ b/convert-gptq-to-ggml.py
@ -36,8 +36,7 @@ fname_out = sys.argv[3]

 fout = open(fname_out, "wb")

-fout.write(struct.pack("i", 0x67676d66)) # magic: ggmf in hex
-fout.write(struct.pack("i", 1)) # file version
+fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
 fout.write(struct.pack("i", n_vocab))
 fout.write(struct.pack("i", n_embd))
 fout.write(struct.pack("i", n_mult))
@ -50,21 +49,27 @@ fout.write(struct.pack("i", 4))
 # This loop unchanged from convert-pth-to-ggml.py:
 for i in range(tokenizer.vocab_size()):
    if tokenizer.is_unknown(i):
+        # "<unk>" token (translated as ??)
        text = " \u2047 ".encode("utf-8")
+        fout.write(struct.pack("i", len(text)))
+        fout.write(text)
    elif tokenizer.is_control(i):
-        text = b""
+        # "<s>"/"</s>" tokens
+        fout.write(struct.pack("i", 0))
    elif tokenizer.is_byte(i):
+        # "<U+XX>" tokens (which may be invalid UTF-8)
        piece = tokenizer.id_to_piece(i)
        if len(piece) != 6:
-            print(f"Invalid token: {piece}")
+            print("Invalid token: " + piece)
            sys.exit(1)
        byte_value = int(piece[3:-1], 16)
-        text = struct.pack("B", byte_value)
+        fout.write(struct.pack("i", 1))
+        fout.write(struct.pack("B", byte_value))
    else:
+        # normal token. Uses U+2581 (LOWER ONE EIGHTH BLOCK) to represent spaces.
        text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-    fout.write(struct.pack("f", tokenizer.get_score(i)))
+        fout.write(struct.pack("i", len(text)))
+        fout.write(text)

 def write_header(shape, dst_name, ftype_cur):
    sname = dst_name.encode('utf-8')
--- a/download-pth.py
+++ b/download-pth.py
@ -0,0 +1,66 @@
+"""Download the original LLaMA weights and tokenizer files into a models directory.
+
+Usage: download-pth.py dir-model model-type
+"""
+import os
+import sys
+from tqdm import tqdm
+import requests
+
+BASE_URL = "https://agi.gpt4.org/llama/LLaMA"
+
+
+def download(url, dest_path, desc):
+    """Stream `url` into `dest_path`, skipping files that already exist.
+
+    The payload goes to a ".part" file that is renamed only after a successful
+    transfer, so a failed download is never mistaken for a finished file.
+    """
+    if os.path.exists(dest_path):
+        print(f"Skip file download, it already exists: {desc}")
+        return
+    tmp_path = dest_path + ".part"
+    with requests.get(url, stream=True) as response:
+        response.raise_for_status()  # never save an HTTP error page as model data
+        with open(tmp_path, 'wb') as f, tqdm(unit='B', unit_scale=True, miniters=1, desc=desc) as t:
+            for chunk in response.iter_content(chunk_size=1024):
+                if chunk:
+                    f.write(chunk)
+                    t.update(len(chunk))
+    os.replace(tmp_path, dest_path)
+
+
+if len(sys.argv) < 3:
+    print("Usage: download-pth.py dir-model model-type\n")
+    print("  model-type: Available models 7B, 13B, 30B or 65B")
+    sys.exit(1)
+
+modelsDir = sys.argv[1]
+model = sys.argv[2]
+
+# number of consolidated.0N.pth files to fetch for each model size
+num = {
+    "7B": 1,
+    "13B": 2,
+    "30B": 4,
+    "65B": 8,
+}
+
+if model not in num:
+    print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B")
+    sys.exit(1)
+
+print(f"Downloading model {model}")
+
+files = ["checklist.chk", "params.json"]
+files += [f"consolidated.0{i}.pth" for i in range(num[model])]
+
+resolved_path = os.path.abspath(os.path.join(modelsDir, model))
+os.makedirs(resolved_path, exist_ok=True)
+
+for file in files:
+    download(f"{BASE_URL}/{model}/{file}", os.path.join(resolved_path, file), file)
+
+# the tokenizer files are stored directly in modelsDir, not in the per-model folder
+for file in ["tokenizer_checklist.chk", "tokenizer.model"]:
+    download(f"{BASE_URL}/{file}", os.path.join(modelsDir, file), file)
--- a/flake.nix
+++ b/flake.nix
@ -28,8 +28,8 @@
          ];
          installPhase = ''
            mkdir -p $out/bin
-            mv bin/main $out/bin/llama
-            mv bin/quantize $out/bin/quantize
+            mv llama $out/bin/llama
+            mv quantize $out/bin/quantize
            echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml
            cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml
            chmod +x $out/bin/convert-pth-to-ggml
--- a/llama.cpp
+++ b/llama.cpp
@ -727,13 +727,11 @@ static bool llama_eval_internal(

            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                    ggml_permute(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-                                n_embd/n_head, n_head, n_past + N),
-                            1, 2, 0, 3),
-                    ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
+                ggml_permute(ctx0,
+                        ggml_reshape_3d(ctx0,
+                            ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
+                            n_embd/n_head, n_head, n_past + N),
+                        1, 2, 0, 3);

            // KQV = transpose(V) * KQ_soft_max
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
--- a/main.cpp
+++ b/main.cpp
@ -258,9 +258,6 @@ int main(int argc, char ** argv) {
        params.interactive = true;
    }

-    // determine newline token
-    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
-
    fprintf(stderr, "\n");
    fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
    fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
@ -362,16 +359,6 @@ int main(int argc, char ** argv) {
                last_n_tokens.push_back(id);
            }

-            // replace end of text token with newline token when in interactive mode
-            if (id == llama_token_eos() && params.interactive) {
-                id = llama_token_newline.front();
-                if (params.antiprompt.size() != 0) {
-                    // tokenize and inject first reverse prompt
-                    const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
-                    embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
-                }
-            }
-
            // add it to the context
            embd.push_back(id);

@ -464,8 +451,12 @@ int main(int argc, char ** argv) {

        // end of text token
        if (embd.back() == llama_token_eos()) {
-            fprintf(stderr, " [end of text]\n");
-            break;
+            if (params.interactive) {
+                is_interacting = true;
+            } else {
+                fprintf(stderr, " [end of text]\n");
+                break;
+            }
        }

        // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
--- a/quantize.py
+++ b/quantize.py
@ -57,7 +57,6 @@ def main():
    # )

    args = parser.parse_args()
-    args.models_path = os.path.abspath(args.models_path)

    if not os.path.isfile(args.quantize_script_path):
        print(
--- a/utils.cpp
+++ b/utils.cpp
@ -26,95 +26,41 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
        params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
    }

-    bool invalid_param = false;
-    std::string arg;
    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
+        std::string arg = argv[i];

        if (arg == "-s" || arg == "--seed") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.seed = std::stoi(argv[i]);
+            params.seed = std::stoi(argv[++i]);
        } else if (arg == "-t" || arg == "--threads") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_threads = std::stoi(argv[i]);
+            params.n_threads = std::stoi(argv[++i]);
        } else if (arg == "-p" || arg == "--prompt") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.prompt = argv[i];
+            params.prompt = argv[++i];
        } else if (arg == "-f" || arg == "--file") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            std::ifstream file(argv[i]);
+            std::ifstream file(argv[++i]);
            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
            if (params.prompt.back() == '\n') {
                params.prompt.pop_back();
            }
        } else if (arg == "-n" || arg == "--n_predict") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_predict = std::stoi(argv[i]);
+            params.n_predict = std::stoi(argv[++i]);
        } else if (arg == "--top_k") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.top_k = std::stoi(argv[i]);
+            params.top_k = std::stoi(argv[++i]);
        } else if (arg == "-c" || arg == "--ctx_size") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_ctx = std::stoi(argv[i]);
+            params.n_ctx = std::stoi(argv[++i]);
        } else if (arg == "--memory_f16") {
            params.memory_f16 = true;
        } else if (arg == "--top_p") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.top_p = std::stof(argv[i]);
+            params.top_p = std::stof(argv[++i]);
        } else if (arg == "--temp") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.temp = std::stof(argv[i]);
+            params.temp = std::stof(argv[++i]);
        } else if (arg == "--repeat_last_n") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.repeat_last_n = std::stoi(argv[i]);
+            params.repeat_last_n = std::stoi(argv[++i]);
        } else if (arg == "--repeat_penalty") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.repeat_penalty = std::stof(argv[i]);
+            params.repeat_penalty = std::stof(argv[++i]);
        } else if (arg == "-b" || arg == "--batch_size") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_batch = std::stoi(argv[i]);
+            params.n_batch = std::stoi(argv[++i]);
        } else if (arg == "-m" || arg == "--model") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.model = argv[i];
+            params.model = argv[++i];
        } else if (arg == "-i" || arg == "--interactive") {
            params.interactive = true;
        } else if (arg == "--interactive-first") {
@ -124,21 +70,13 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
        } else if (arg == "--color") {
            params.use_color = true;
        } else if (arg == "-r" || arg == "--reverse-prompt") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.antiprompt.push_back(argv[i]);
+            params.antiprompt.push_back(argv[++i]);
        } else if (arg == "--perplexity") {
            params.perplexity = true;
        } else if (arg == "--ignore-eos") {
            params.ignore_eos = true;
        } else if (arg == "--n_parts") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_parts = std::stoi(argv[i]);
+            params.n_parts = std::stoi(argv[++i]);
        } else if (arg == "-h" || arg == "--help") {
            gpt_print_usage(argc, argv, params);
            exit(0);
@ -147,14 +85,9 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            gpt_print_usage(argc, argv, params);
-            exit(1);
+            exit(0);
        }
    }
-    if (invalid_param) {
-        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        gpt_print_usage(argc, argv, params);
-        exit(1);
-    }

    return true;
 }