Local III EasyOCR
KillianLucas committed Jun 20, 2024
1 parent e05265d commit 87cc2f3
Showing 10 changed files with 463 additions and 64 deletions.
9 changes: 6 additions & 3 deletions interpreter/core/computer/utils/computer_vision.py
@@ -9,14 +9,17 @@
 except:
     cv2 = None  # Fixes colab error
 PIL = lazy_import("PIL")
+# pytesseract is very very optional, we don't even recommend it unless the api has failed
+pytesseract = lazy_import("pytesseract")
 
 
 def pytesseract_get_text(img):
-    import pytesseract
-
+    # List the attributes of pytesseract, which will trigger lazy loading of it
+    attributes = dir(pytesseract)
     if pytesseract == None:
         raise ImportError("The pytesseract module could not be imported.")
 
-    return pytesseract.image_to_string(img)
+    result = pytesseract.image_to_string(img)
+    return result
 
 
 def pytesseract_get_text_bounding_boxes(img):
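For context on the pattern above: `lazy_import` returns None when a package is missing (which is what makes the `pytesseract == None` check meaningful), and otherwise defers the real import until the module is first touched, which is why a bare `dir(pytesseract)` is enough to force the load. A minimal sketch of such a helper, assuming the standard-library LazyLoader approach (the actual helper in interpreter/core/utils may differ):

    import importlib.util
    import sys

    def lazy_import(name):
        # Return None when the package isn't installed, so callers can
        # check `module == None` instead of catching ImportError
        spec = importlib.util.find_spec(name)
        if spec is None:
            return None
        # Defer executing the module body until first attribute access
        # (e.g. dir(module) or module.image_to_string)
        spec.loader = importlib.util.LazyLoader(spec.loader)
        module = importlib.util.module_from_spec(spec)
        sys.modules[name] = module
        spec.loader.exec_module(module)
        return module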
67 changes: 41 additions & 26 deletions interpreter/core/computer/vision/vision.py
@@ -17,35 +17,44 @@ def __init__(self, computer):
         self.computer = computer
         self.model = None  # Will load upon first use
         self.tokenizer = None  # Will load upon first use
+        self.easyocr = None
 
-    def load(self):
-        print("\nLoading Moondream (vision)...\n")
+    def load(self, load_moondream=True, load_easyocr=True):
+        # print("Loading vision models (Moondream, EasyOCR)...\n")
 
         with contextlib.redirect_stdout(
             open(os.devnull, "w")
         ), contextlib.redirect_stderr(open(os.devnull, "w")):
-            import transformers  # Wait until we use it. Transformers can't be lazy loaded for some reason!
-
-            os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
-            if self.computer.debug:
-                print(
-                    "Open Interpreter will use Moondream (tiny vision model) to describe images to the language model. Set `interpreter.llm.vision_renderer = None` to disable this behavior."
-                )
-                print(
-                    "Alternatively, you can use a vision-supporting LLM and set `interpreter.llm.supports_vision = True`."
-                )
-            model_id = "vikhyatk/moondream2"
-            revision = "2024-04-02"
-            print("loading model")
-
-            self.model = transformers.AutoModelForCausalLM.from_pretrained(
-                model_id, trust_remote_code=True, revision=revision
-            )
-            self.tokenizer = transformers.AutoTokenizer.from_pretrained(
-                model_id, revision=revision
-            )
-            return True
+            if self.easyocr == None and load_easyocr:
+                import easyocr
+
+                self.easyocr = easyocr.Reader(
+                    ["en"]
+                )  # this needs to run only once to load the model into memory
+
+            if self.model == None and load_moondream:
+                import transformers  # Wait until we use it. Transformers can't be lazy loaded for some reason!
+
+                os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+                if self.computer.debug:
+                    print(
+                        "Open Interpreter will use Moondream (tiny vision model) to describe images to the language model. Set `interpreter.llm.vision_renderer = None` to disable this behavior."
+                    )
+                    print(
+                        "Alternatively, you can use a vision-supporting LLM and set `interpreter.llm.supports_vision = True`."
+                    )
+                model_id = "vikhyatk/moondream2"
+                revision = "2024-04-02"
+                print("loading model")
+
+                self.model = transformers.AutoModelForCausalLM.from_pretrained(
+                    model_id, trust_remote_code=True, revision=revision
+                )
+                self.tokenizer = transformers.AutoTokenizer.from_pretrained(
+                    model_id, revision=revision
+                )
+        return True
 
     def ocr(
         self,
@@ -99,7 +108,11 @@ def ocr(
             path = temp_file_path
 
         try:
-            return pytesseract_get_text(path)
+            if not self.easyocr:
+                self.load(load_moondream=False)
+            result = self.easyocr.readtext(path)
+            text = " ".join([item[1] for item in result])
+            return text.strip()
         except ImportError:
             print(
                 "\nTo use local vision, run `pip install 'open-interpreter[local]'`.\n"
@@ -108,7 +121,7 @@
 
     def query(
         self,
-        query="Describe this image.",
+        query="Describe this image. Also tell me what text is in the image, if any.",
         base_64=None,
         path=None,
         lmc=None,
@@ -119,7 +132,7 @@
         """
 
         if self.model == None and self.tokenizer == None:
-            success = self.load()
+            success = self.load(load_easyocr=False)
            if not success:
                return ""

@@ -149,6 +162,8 @@
 
         with contextlib.redirect_stdout(open(os.devnull, "w")):
             enc_image = self.model.encode_image(img)
-            answer = self.model.answer_question(enc_image, query, self.tokenizer)
+            answer = self.model.answer_question(
+                enc_image, query, self.tokenizer, max_length=400
+            )
 
         return answer
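The `" ".join([item[1] for item in result])` in the new ocr() works because EasyOCR's `readtext` returns a list of `(bounding_box, text, confidence)` tuples. A small standalone usage sketch (the image path here is hypothetical):

    import easyocr

    reader = easyocr.Reader(["en"])  # loads detection/recognition models once
    results = reader.readtext("screenshot.png")  # hypothetical image path
    # results look like:
    # [([[7, 5], [121, 5], [121, 27], [7, 27]], 'Sign in', 0.98), ...]
    text = " ".join(item[1] for item in results)
    print(text.strip())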
21 changes: 19 additions & 2 deletions interpreter/core/llm/llm.py
@@ -48,6 +48,7 @@ def __init__(self, interpreter):
         self.api_base = None
         self.api_key = None
         self.api_version = None
+        self._is_loaded = False
 
         # Budget manager powered by LiteLLM
         self.max_budget = None
@@ -143,7 +144,7 @@ def run(self, messages):
                     img_msg["content"] = (
                         precursor
                         + image_description
-                        + "\n---\nThe image contains the following text exactly, which may or may not be relevant (if it's not relevant, ignore this): '''\n"
+                        + "\n---\nI've OCR'd the image, this is the result (this may or may not be relevant. If it's not relevant, ignore this): '''\n"
                         + ocr
                         + "\n'''"
                         + postcursor
@@ -273,7 +274,20 @@ def run(self, messages):
         else:
             yield from run_text_llm(self, params)
 
+    # If you change model, set _is_loaded to false
+    @property
+    def model(self):
+        return self._model
+
+    @model.setter
+    def model(self, value):
+        self._model = value
+        self._is_loaded = False
+
     def load(self):
+        if self._is_loaded:
+            return
+
         if self.model.startswith("ollama/"):
             # WOAH we should also hit up ollama and set max_tokens and context_window based on the LLM. I think they let u do that
@@ -302,7 +316,7 @@ def load(self):
             subprocess.run(["ollama", "pull", model_name], check=True)
 
         # Send a ping, which will actually load the model
-        print(f"\nLoading {model_name}...\n")
+        # print(f"\nLoading {model_name}...\n")
 
         old_max_tokens = self.max_tokens
         self.max_tokens = 1
@@ -313,6 +327,9 @@
 
         # Validate LLM should be moved here!!
 
+        self._is_loaded = True
+        return
+
 
 def fixed_litellm_completions(**params):
     """
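The `model` property and `_is_loaded` flag added above form a small cache-invalidation pattern: `load()` becomes idempotent, and reassigning `llm.model` automatically forces a reload on next use. A stripped-down sketch of the behavior (names abridged from the diff; the real load() pulls and pings an Ollama model):

    class Llm:
        def __init__(self):
            self._model = None
            self._is_loaded = False

        @property
        def model(self):
            return self._model

        @model.setter
        def model(self, value):
            self._model = value
            self._is_loaded = False  # changing models invalidates the cache

        def load(self):
            if self._is_loaded:
                return  # second and later calls are free
            # ... expensive work: pull the model, send a warm-up ping ...
            self._is_loaded = True

    llm = Llm()
    llm.model = "ollama/codestral"
    llm.load()                   # does the expensive load
    llm.load()                   # no-op
    llm.model = "ollama/llama3"  # setter resets _is_loaded
    llm.load()                   # loads again for the new model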
4 changes: 2 additions & 2 deletions interpreter/core/utils/system_debug_info.py
@@ -86,9 +86,9 @@ def interpreter_info(interpreter):
     for message in interpreter.messages:
         message = message.copy()
         try:
-            if len(message["content"]) > 600:
+            if len(message["content"]) > 5000:
                 message["content"] = (
-                    message["content"][:300] + "..." + message["content"][-300:]
+                    message["content"][:800] + "..." + message["content"][-800:]
                 )
         except Exception as e:
             print(str(e), "for message:", message)
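The change raises the threshold at which debug output truncates a message (600 to 5000 characters) and keeps more of each end (300 to 800). The behavior is head-and-tail truncation; a minimal equivalent, with the function name chosen here for illustration:

    def truncate_middle(text, limit=5000, keep=800):
        # Keep the first and last `keep` characters once `text` exceeds `limit`
        if len(text) > limit:
            return text[:keep] + "..." + text[-keep:]
        return text

    print(len(truncate_middle("x" * 6000)))  # 1603 (800 + 3 + 800)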
10 changes: 8 additions & 2 deletions interpreter/terminal_interface/magic_commands.py
@@ -82,7 +82,10 @@ def handle_verbose(self, arguments=None):
     print("\n\nCurrent messages:\n")
     for message in self.messages:
         message = message.copy()
-        if message["type"] == "image" and message.get("format") != "path":
+        if message["type"] == "image" and message.get("format") not in [
+            "path",
+            "description",
+        ]:
             message["content"] = (
                 message["content"][:30] + "..." + message["content"][-30:]
             )
@@ -102,7 +105,10 @@ def handle_debug(self, arguments=None):
     print("\n\nCurrent messages:\n")
     for message in self.messages:
         message = message.copy()
-        if message["type"] == "image" and message.get("format") != "path":
+        if message["type"] == "image" and message.get("format") not in [
+            "path",
+            "description",
+        ]:
             message["content"] = (
                 message["content"][:30] + "..." + message["content"][-30:]
             )
(additional changed file; its name and change counts were not captured in this view)

@@ -113,13 +113,14 @@
 interpreter.offline = True
 interpreter.os = True
 
+# Vision setup
+interpreter.computer.vision.load()
+
 # Final message
 interpreter.display_message(
     "**Warning:** In this mode, Open Interpreter will not require approval before performing actions. Be ready to close your terminal."
 )
 interpreter.display_message(
     "\n**Note:** Codestral is a relatively weak model, so OS mode is highly experimental. Try using a more powerful model for OS mode with `interpreter --os`."
 )
-interpreter.display_message(
-    "> Model set to `codestral`, experimental OS control enabled"
-)
+interpreter.display_message("> Experimental OS control enabled.")
(additional changed file; its name and change counts were not captured in this view)

@@ -17,7 +17,7 @@
 User: The code you ran produced no output. Was this expected, or are we finished?
 Assistant: No further action is required; the provided snippet opens Chrome.
 
-You have access to ONE special function called `computer.vision.query(query="Describe this image.", path="image.jpg")`. This will ask a vision AI model the query, regarding the image at path. For example:
+You have access to TWO special functions called `computer.vision.query(query="Describe this image.", path="image.jpg")` (asks a vision AI model the query, regarding the image at path) and `computer.vision.ocr(path="image.jpg")` (returns text in the image at path). For example:
 
 User: Rename the images on my desktop to something more descriptive.
 Assistant: Viewing and renaming images.
@@ -53,6 +53,25 @@
 ```
 User: The code you ran produced no output. Was this expected, or are we finished?
 Assistant: We are finished.
 
+User: What text is in the image 'user.png' on my desktop?
+Assistant: ```python
+from pathlib import Path
+
+# Get the user's desktop directory in a cross-platform way
+desktop_dir = Path.home() / 'Desktop'
+
+# Define the path to the image
+image_path = desktop_dir / 'user.png'
+
+# Get the text in the image
+text_in_image = computer.vision.ocr(path=str(image_path))
+text_in_image
+```
+
+User: The code you ran produced this output: "29294 is the username". What does this mean?
+Assistant: The output means that the `user.png` image on your desktop contains the text "29294 is the username".
+
 NEVER use placeholders. Always specify exact paths, and use cross-platform ways of determining the desktop, documents, etc. folders.
@@ -65,15 +84,16 @@
 
 # LLM settings
 interpreter.llm.model = "ollama/codestral"
-interpreter.llm.load()  # Loads Ollama models
 interpreter.llm.supports_functions = False
 interpreter.llm.execution_instructions = False
 interpreter.llm.max_tokens = 1000
 interpreter.llm.context_window = 7000
+interpreter.llm.load()  # Loads Ollama models
 
 # Computer settings
 interpreter.computer.import_computer_api = True
 interpreter.computer.system_message = ""  # The default will explain how to use the full Computer API, and append this to the system message. For local models, we want more control, so we set this to "". The system message will ONLY be what's above ^
+interpreter.computer.vision.load()  # Load vision models
 
 # Misc settings
 interpreter.auto_run = False
(diffs for the remaining changed files did not load in this capture)