Skip to content

Commit

Permalink
Set max_tokens parameter to 4000
Browse files Browse the repository at this point in the history
This avoids truncated answers. Also reordered the imports.
  • Loading branch information
almet authored Sep 28, 2023
1 parent eabfe2f commit f85d51e
Showing 1 changed file with 8 additions and 6 deletions.
14 changes: 8 additions & 6 deletions llm_llama_cpp.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
import click
import httpx
import io
import json
import llm
import os
import pathlib
import sys

import click
import httpx
import llm

try:
from pydantic import field_validator, Field # type: ignore
from pydantic import Field, field_validator # type: ignore
except ImportError:
from pydantic.class_validators import \
validator as field_validator # type: ignore [no-redef]
from pydantic.fields import Field
from pydantic.class_validators import validator as field_validator # type: ignore [no-redef]

try:
from llama_cpp import Llama
Expand Down Expand Up @@ -254,7 +256,7 @@ def execute(self, prompt, stream, response, conversation):
response._prompt_json = {"prompt_bits": prompt_bits}
else:
prompt_text = prompt.prompt
stream = llm_model(prompt_text, stream=True)
stream = llm_model(prompt_text, stream=True, max_tokens=4000)
for item in stream:
# Each item looks like this:
# {'id': 'cmpl-00...', 'object': 'text_completion', 'created': .., 'model': '/path', 'choices': [
Expand Down

0 comments on commit f85d51e

Please sign in to comment.