From de5bf5bf68117b78fe60c3c727a47108382959db Mon Sep 17 00:00:00 2001
From: KerfuffleV2
Date: Sun, 21 May 2023 05:20:56 -0600
Subject: [PATCH 1/5] Some improvements to loading the session with --prompt-cache

1. Currently the --seed parameter is ignored when loading the prompt. However,
   a very common use case would be to save a prompt and then try several
   attempts at generation with different seeds.
2. When loading a cached prompt from a session, you have to specify the prompt
   again. Even worse, if you forget to enter a prompt you'll get your cached
   prompt overwritten by the blank one.
---
 examples/main/main.cpp | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 47b418d972bbc..69507cbc023a8 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -134,8 +134,6 @@ int main(int argc, char ** argv) {
         return 0;
     }

-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    params.prompt.insert(0, 1, ' ');

     std::string path_session = params.path_prompt_cache;
     std::vector<llama_token> session_tokens;
@@ -155,6 +153,9 @@ int main(int argc, char ** argv) {
                 return 1;
             }
             session_tokens.resize(n_token_count_out);
+            if (params.seed != -1) {
+                llama_set_rng_seed(ctx, params.seed);
+            }

             fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
         } else {
@@ -163,7 +164,16 @@ int main(int argc, char ** argv) {
     }

     // tokenize the prompt
-    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> embd_inp;
+
+    if (params.prompt.size() > 0 || session_tokens.size() == 0) {
+        // Add a space in front of the first character to match OG llama tokenizer behavior
+        params.prompt.insert(0, 1, ' ');
+
+        embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+    } else {
+        embd_inp = session_tokens;
+    }

     const int n_ctx = llama_n_ctx(ctx);

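As an illustration of the workflow the commit message above describes (not part of the patch itself; --prompt-cache, --seed, -p and -m are the existing examples/main options, while the model path and prompt text below are just placeholders), a cached prompt could be reused with several seeds roughly like this:

    # First run: evaluate the prompt once, save the state to the cache file and generate.
    ./main -m models/7B/ggml-model.bin --prompt-cache prompt.cache --seed 1 -p "Once upon a time"

    # Later runs: reuse the cached prompt without retyping it and only vary the seed.
    ./main -m models/7B/ggml-model.bin --prompt-cache prompt.cache --seed 2
    ./main -m models/7B/ggml-model.bin --prompt-cache prompt.cache --seed 3
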
From 2d79928982fdb1c7adaf996b7abdcef7fcaedeba Mon Sep 17 00:00:00 2001
From: KerfuffleV2
Date: Sun, 21 May 2023 05:38:01 -0600
Subject: [PATCH 2/5] Apply clang suggestions.

---
 examples/main/main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 69507cbc023a8..c05a49dd3c3c0 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -166,7 +166,7 @@ int main(int argc, char ** argv) {
     // tokenize the prompt
     std::vector<llama_token> embd_inp;

-    if (params.prompt.size() > 0 || session_tokens.size() == 0) {
+    if (!params.prompt.empty() || session_tokens.empty()) {
         // Add a space in front of the first character to match OG llama tokenizer behavior
         params.prompt.insert(0, 1, ' ');


From e1ec489ef2f1e90650f0ffeb4c2ac234df541d09 Mon Sep 17 00:00:00 2001
From: KerfuffleV2
Date: Tue, 23 May 2023 07:05:44 -0600
Subject: [PATCH 3/5] Use existing session behavior when in instruct or interact first mode

---
 examples/main/main.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index c05a49dd3c3c0..868dc03a2c14f 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -166,7 +166,7 @@ int main(int argc, char ** argv) {
     // tokenize the prompt
     std::vector<llama_token> embd_inp;

-    if (!params.prompt.empty() || session_tokens.empty()) {
+    if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
         // Add a space in front of the first character to match OG llama tokenizer behavior
         params.prompt.insert(0, 1, ' ');

@@ -191,7 +191,9 @@ int main(int argc, char ** argv) {
             }
             n_matching_session_tokens++;
         }
-        if (n_matching_session_tokens >= embd_inp.size()) {
+        if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
+            fprintf(stderr, "%s: using full prompt from session file\n", __func__);
+        } else if (n_matching_session_tokens >= embd_inp.size()) {
             fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
         } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
             fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",

From 76c73987bfee43e922ed384eeea4d7d0e7817412 Mon Sep 17 00:00:00 2001
From: KerfuffleV2
Date: Wed, 24 May 2023 02:55:30 -0600
Subject: [PATCH 4/5] Use the initial value of params.seed to determine if user supplied seed

Display some helpful information to the user when loading a session to make
it clear when the seed applies or not.
---
 examples/main/main.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 868dc03a2c14f..eeb20162a3a7e 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -85,6 +85,9 @@ int main(int argc, char ** argv) {

     fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);

+    // Save the initial seed parameter before overwriting it so it's possible to determine whether
+    // the user supplied a seed or not. This is useful when loading saved sessions.
+    int32_t initial_seed = params.seed;
     if (params.seed < 0) {
         params.seed = time(NULL);
     }
@@ -153,8 +156,11 @@ int main(int argc, char ** argv) {
                 return 1;
             }
             session_tokens.resize(n_token_count_out);
-            if (params.seed != -1) {
+            if (initial_seed != -1) {
+                fprintf(stderr, "%s: seed argument overrides session file RNG state, will now use seed: %d\n", __func__, params.seed);
                 llama_set_rng_seed(ctx, params.seed);
+            } else {
+                fprintf(stderr, "%s: using RNG state from loaded session file rather than seed\n", __func__);
             }

             fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());

From 156d70b82b607102876603f6902209460936a2f7 Mon Sep 17 00:00:00 2001
From: KerfuffleV2
Date: Thu, 25 May 2023 00:00:54 -0600
Subject: [PATCH 5/5] Always set RNG seed when restoring cached prompt in main example.

Add a note in the main example README about how restoring a prompt doesn't
imply restoring the exact session state.
---
 examples/main/README.md |  2 +-
 examples/main/main.cpp  | 10 +---------
 2 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/examples/main/README.md b/examples/main/README.md
index 7c03f92c897d9..e71ba617366a9 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -272,7 +272,7 @@ These options help improve the performance and memory usage of the LLaMA models.

 ### Prompt Caching

-- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs.
+- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.

 ### Quantization

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index eeb20162a3a7e..c7c591537419c 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -85,9 +85,6 @@ int main(int argc, char ** argv) {

     fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);

-    // Save the initial seed parameter before overwriting it so it's possible to determine whether
-    // the user supplied a seed or not. This is useful when loading saved sessions.
-    int32_t initial_seed = params.seed;
     if (params.seed < 0) {
         params.seed = time(NULL);
     }
@@ -156,12 +153,7 @@ int main(int argc, char ** argv) {
             return 1;
         }
         session_tokens.resize(n_token_count_out);
-        if (initial_seed != -1) {
-            fprintf(stderr, "%s: seed argument overrides session file RNG state, will now use seed: %d\n", __func__, params.seed);
-            llama_set_rng_seed(ctx, params.seed);
-        } else {
-            fprintf(stderr, "%s: using RNG state from loaded session file rather than seed\n", __func__);
-        }
+        llama_set_rng_seed(ctx, params.seed);

         fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
     } else {
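
As a rough illustration of the README note added in the last patch (same placeholder model path and prompt text as in the earlier example, and the existing --prompt-cache, --seed and -p options):

    # Run 1 creates prompt.cache and generates with seed 42.
    ./main -m models/7B/ggml-model.bin --prompt-cache prompt.cache --seed 42 -p "Once upon a time"

    # Run 2 restores the cached prompt and reuses seed 42, but because restoring the cached
    # prompt does not restore the exact session state at the point it was saved, it is not
    # guaranteed to reproduce run 1's generated tokens.
    ./main -m models/7B/ggml-model.bin --prompt-cache prompt.cache --seed 42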