mirror of https://github.com/novarobot/llama.cpp
Compare commits
No commits in common. 'master' and 'master-56e659a' have entirely different histories.
master...master-56e659a
download-pth.py (new file)
@@ -0,0 +1,66 @@
import os
import sys
from tqdm import tqdm
import requests

if len(sys.argv) < 3:
    print("Usage: download-pth.py dir-model model-type\n")
    print("  model-type: Available models 7B, 13B, 30B or 65B")
    sys.exit(1)

modelsDir = sys.argv[1]
model = sys.argv[2]

num = {
    "7B": 1,
    "13B": 2,
    "30B": 4,
    "65B": 8,
}

if model not in num:
    print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B")
    sys.exit(1)

print(f"Downloading model {model}")

files = ["checklist.chk", "params.json"]

for i in range(num[model]):
    files.append(f"consolidated.0{i}.pth")

resolved_path = os.path.abspath(os.path.join(modelsDir, model))
os.makedirs(resolved_path, exist_ok=True)

for file in files:
    dest_path = os.path.join(resolved_path, file)

    if os.path.exists(dest_path):
        print(f"Skip file download, it already exists: {file}")
        continue

    url = f"https://agi.gpt4.org/llama/LLaMA/{model}/{file}"
    response = requests.get(url, stream=True)
    with open(dest_path, 'wb') as f:
        with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    t.update(len(chunk))

files2 = ["tokenizer_checklist.chk", "tokenizer.model"]
for file in files2:
    dest_path = os.path.join(modelsDir, file)

    if os.path.exists(dest_path):
        print(f"Skip file download, it already exists: {file}")
        continue

    url = f"https://agi.gpt4.org/llama/LLaMA/{file}"
    response = requests.get(url, stream=True)
    with open(dest_path, 'wb') as f:
        with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    t.update(len(chunk))
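Aside: the two download loops in this script are identical apart from the URL prefix. A minimal sketch of a shared helper, not part of this commit and assuming the same os/requests/tqdm imports as above:

def download_file(url: str, dest_path: str) -> None:
    # Stream the response to disk in 1 KiB chunks behind a tqdm progress bar.
    response = requests.get(url, stream=True)
    response.raise_for_status()  # assumption: surface HTTP errors rather than saving an error page
    with open(dest_path, 'wb') as f:
        with tqdm(unit='B', unit_scale=True, miniters=1, desc=os.path.basename(dest_path)) as t:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)
                    t.update(len(chunk))

Each loop body would then reduce to the existence check plus a single download_file(url, dest_path) call.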
tests/CMakeLists.txt
@@ -1,9 +1,4 @@
-function(llama_add_test source)
-    get_filename_component(TEST_TARGET ${source} NAME_WE)
-    add_executable(${TEST_TARGET} ${source})
-    target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils)
-    add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
-endfunction()
-
-llama_add_test(test-quantize.c)
-llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
+set(TEST_TARGET test-tokenizer-0)
+add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
+target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils)
+add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
tests/test-quantize.c (deleted)
@@ -1,42 +0,0 @@
#include "ggml.h"
#undef NDEBUG
#include <assert.h>
#include <math.h>

int main(void) {
    #define QK 32
    float src[QK];
    uint8_t dst[24];
    int64_t hist[16];

    for (int i = 0; i < QK; i++) {
        src[i] = (float)(i + 1);
    }

    size_t size = ggml_quantize_q4_0(src, dst, QK, QK, QK, hist);
    assert(size == 20);
    float max_result = ((float *)dst)[0];
    float max_expected = src[31] / ((1 << 3) - 1);
    assert(max_result == max_expected);
    for (int i = 0; i < QK; i++) {
        uint8_t q4_result = (i % 2) ? (dst[sizeof(float) + i/2] >> 4) : (dst[sizeof(float) + i/2] & 0xF);
        uint8_t q4_expected = roundf(src[i] / max_expected) + 8;
        assert(q4_result == q4_expected);
    }

    size = ggml_quantize_q4_1(src, dst, QK, QK, QK, hist);
    assert(size == 24);
    float delta_result = ((float *)dst)[0];
    float delta_expected = (src[31] - src[0]) / ((1 << 4) - 1);
    assert(delta_result == delta_expected);
    float min_result = ((float *)dst)[1];
    float min_expected = src[0];
    assert(min_result == min_expected);
    for (int i = 0; i < QK; i++) {
        uint8_t q4_result = (i % 2) ? (dst[sizeof(float)*2 + i/2] >> 4) : (dst[sizeof(float)*2 + i/2] & 0xF);
        uint8_t q4_expected = roundf((src[i] - min_expected) / delta_expected);
        assert(q4_result == q4_expected);
    }

    return 0;
}
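For orientation, the deleted test pins down the early q4_0 and q4_1 block layouts: ggml_quantize_q4_0 packs 32 floats into 20 bytes (a 4-byte float scale followed by 16 bytes of 4-bit values biased by +8), and ggml_quantize_q4_1 into 24 bytes (scale and minimum floats, then the same 16 packed bytes). A minimal Python sketch, not from the repo, that decodes one q4_0 block under exactly the layout the asserts above check:

import struct

QK = 32  # values per quantization block, as in the test

def dequantize_q4_0(block: bytes) -> list[float]:
    # 20-byte block: little-endian float scale, then QK/2 packed bytes,
    # even-indexed value in the low nibble, odd-indexed in the high nibble.
    (scale,) = struct.unpack_from("<f", block, 0)
    values = []
    for byte in block[4:4 + QK // 2]:
        for q in (byte & 0xF, byte >> 4):
            values.append((q - 8) * scale)  # undo the +8 bias, rescale
    return values

A q4_1 decoder would instead read two leading floats (delta and minimum) and map each nibble to q * delta + minimum, mirroring the second half of the test.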