Local III EasyOCR
KillianLucas committed Jun 20, 2024
1 parent e05265d commit 87cc2f3
Showing 10 changed files with 463 additions and 64 deletions.
9 changes: 6 additions & 3 deletions interpreter/core/computer/utils/computer_vision.py
@@ -9,14 +9,17 @@
 except:
     cv2 = None  # Fixes colab error
 PIL = lazy_import("PIL")
+# pytesseract is very very optional, we don't even recommend it unless the api has failed
+pytesseract = lazy_import("pytesseract")
 
 
 def pytesseract_get_text(img):
-    import pytesseract
-
+    # List the attributes of pytesseract, which will trigger lazy loading of it
+    attributes = dir(pytesseract)
     if pytesseract == None:
         raise ImportError("The pytesseract module could not be imported.")
 
-    return pytesseract.image_to_string(img)
+    result = pytesseract.image_to_string(img)
+    return result
 
 
 def pytesseract_get_text_bounding_boxes(img):
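For context on the pattern above: `lazy_import` returns None when a package is missing (which is what makes the `pytesseract == None` check meaningful), and otherwise defers the real import until the module is first touched, which is why a bare `dir(pytesseract)` is enough to force the load. A minimal sketch of such a helper, assuming the standard-library LazyLoader approach (the actual helper in interpreter/core/utils may differ):

    import importlib.util
    import sys

    def lazy_import(name):
        # Return None when the package isn't installed, so callers can
        # check `module == None` instead of catching ImportError
        spec = importlib.util.find_spec(name)
        if spec is None:
            return None
        # Defer executing the module body until first attribute access
        # (e.g. dir(module) or module.image_to_string)
        spec.loader = importlib.util.LazyLoader(spec.loader)
        module = importlib.util.module_from_spec(spec)
        sys.modules[name] = module
        spec.loader.exec_module(module)
        return module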
67 changes: 41 additions & 26 deletions interpreter/core/computer/vision/vision.py
@@ -17,35 +17,44 @@ def __init__(self, computer):
         self.computer = computer
         self.model = None  # Will load upon first use
         self.tokenizer = None  # Will load upon first use
+        self.easyocr = None
 
-    def load(self):
-        print("\nLoading Moondream (vision)...\n")
+    def load(self, load_moondream=True, load_easyocr=True):
+        # print("Loading vision models (Moondream, EasyOCR)...\n")
 
         with contextlib.redirect_stdout(
             open(os.devnull, "w")
         ), contextlib.redirect_stderr(open(os.devnull, "w")):
-            import transformers  # Wait until we use it. Transformers can't be lazy loaded for some reason!
-
-            os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
-            if self.computer.debug:
-                print(
-                    "Open Interpreter will use Moondream (tiny vision model) to describe images to the language model. Set `interpreter.llm.vision_renderer = None` to disable this behavior."
-                )
-                print(
-                    "Alternatively, you can use a vision-supporting LLM and set `interpreter.llm.supports_vision = True`."
-                )
-            model_id = "vikhyatk/moondream2"
-            revision = "2024-04-02"
-            print("loading model")
-
-            self.model = transformers.AutoModelForCausalLM.from_pretrained(
-                model_id, trust_remote_code=True, revision=revision
-            )
-            self.tokenizer = transformers.AutoTokenizer.from_pretrained(
-                model_id, revision=revision
-            )
-            return True
+            if self.easyocr == None and load_easyocr:
+                import easyocr
+
+                self.easyocr = easyocr.Reader(
+                    ["en"]
+                )  # this needs to run only once to load the model into memory
+
+            if self.model == None and load_moondream:
+                import transformers  # Wait until we use it. Transformers can't be lazy loaded for some reason!
+
+                os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+                if self.computer.debug:
+                    print(
+                        "Open Interpreter will use Moondream (tiny vision model) to describe images to the language model. Set `interpreter.llm.vision_renderer = None` to disable this behavior."
+                    )
+                    print(
+                        "Alternatively, you can use a vision-supporting LLM and set `interpreter.llm.supports_vision = True`."
+                    )
+                model_id = "vikhyatk/moondream2"
+                revision = "2024-04-02"
+                print("loading model")
+
+                self.model = transformers.AutoModelForCausalLM.from_pretrained(
+                    model_id, trust_remote_code=True, revision=revision
+                )
+                self.tokenizer = transformers.AutoTokenizer.from_pretrained(
+                    model_id, revision=revision
+                )
+        return True
 
     def ocr(
         self,
@@ -99,7 +108,11 @@ def ocr(
             path = temp_file_path
 
         try:
-            return pytesseract_get_text(path)
+            if not self.easyocr:
+                self.load(load_moondream=False)
+            result = self.easyocr.readtext(path)
+            text = " ".join([item[1] for item in result])
+            return text.strip()
         except ImportError:
             print(
                 "\nTo use local vision, run `pip install 'open-interpreter[local]'`.\n"
@@ -108,7 +121,7 @@
 
     def query(
         self,
-        query="Describe this image.",
+        query="Describe this image. Also tell me what text is in the image, if any.",
         base_64=None,
         path=None,
         lmc=None,
@@ -119,7 +132,7 @@
         """
 
         if self.model == None and self.tokenizer == None:
-            success = self.load()
+            success = self.load(load_easyocr=False)
            if not success:
                return ""

@@ -149,6 +162,8 @@
 
         with contextlib.redirect_stdout(open(os.devnull, "w")):
             enc_image = self.model.encode_image(img)
-            answer = self.model.answer_question(enc_image, query, self.tokenizer)
+            answer = self.model.answer_question(
+                enc_image, query, self.tokenizer, max_length=400
+            )
 
         return answer
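The `" ".join([item[1] for item in result])` in the new ocr() works because EasyOCR's `readtext` returns a list of `(bounding_box, text, confidence)` tuples. A small standalone usage sketch (the image path here is hypothetical):

    import easyocr

    reader = easyocr.Reader(["en"])  # loads detection/recognition models once
    results = reader.readtext("screenshot.png")  # hypothetical image path
    # results look like:
    # [([[7, 5], [121, 5], [121, 27], [7, 27]], 'Sign in', 0.98), ...]
    text = " ".join(item[1] for item in results)
    print(text.strip())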
21 changes: 19 additions & 2 deletions interpreter/core/llm/llm.py
@@ -48,6 +48,7 @@ def __init__(self, interpreter):
         self.api_base = None
         self.api_key = None
         self.api_version = None
+        self._is_loaded = False
 
         # Budget manager powered by LiteLLM
         self.max_budget = None
@@ -143,7 +144,7 @@ def run(self, messages):
                     img_msg["content"] = (
                         precursor
                         + image_description
-                        + "\n---\nThe image contains the following text exactly, which may or may not be relevant (if it's not relevant, ignore this): '''\n"
+                        + "\n---\nI've OCR'd the image, this is the result (this may or may not be relevant. If it's not relevant, ignore this): '''\n"
                         + ocr
                         + "\n'''"
                         + postcursor
@@ -273,7 +274,20 @@ def run(self, messages):
         else:
             yield from run_text_llm(self, params)
 
+    # If you change model, set _is_loaded to false
+    @property
+    def model(self):
+        return self._model
+
+    @model.setter
+    def model(self, value):
+        self._model = value
+        self._is_loaded = False
+
     def load(self):
+        if self._is_loaded:
+            return
+
         if self.model.startswith("ollama/"):
             # WOAH we should also hit up ollama and set max_tokens and context_window based on the LLM. I think they let u do that
@@ -302,7 +316,7 @@ def load(self):
             subprocess.run(["ollama", "pull", model_name], check=True)
 
         # Send a ping, which will actually load the model
-        print(f"\nLoading {model_name}...\n")
+        # print(f"\nLoading {model_name}...\n")
 
         old_max_tokens = self.max_tokens
         self.max_tokens = 1
@@ -313,6 +327,9 @@
 
         # Validate LLM should be moved here!!
 
+        self._is_loaded = True
+        return
+
 
 def fixed_litellm_completions(**params):
     """
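The `model` property and `_is_loaded` flag added above form a small cache-invalidation pattern: `load()` becomes idempotent, and reassigning `llm.model` automatically forces a reload on next use. A stripped-down sketch of the behavior (names abridged from the diff; the real load() pulls and pings an Ollama model):

    class Llm:
        def __init__(self):
            self._model = None
            self._is_loaded = False

        @property
        def model(self):
            return self._model

        @model.setter
        def model(self, value):
            self._model = value
            self._is_loaded = False  # changing models invalidates the cache

        def load(self):
            if self._is_loaded:
                return  # second and later calls are free
            # ... expensive work: pull the model, send a warm-up ping ...
            self._is_loaded = True

    llm = Llm()
    llm.model = "ollama/codestral"
    llm.load()                   # does the expensive load
    llm.load()                   # no-op
    llm.model = "ollama/llama3"  # setter resets _is_loaded
    llm.load()                   # loads again for the new model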
4 changes: 2 additions & 2 deletions interpreter/core/utils/system_debug_info.py
@@ -86,9 +86,9 @@ def interpreter_info(interpreter):
     for message in interpreter.messages:
         message = message.copy()
         try:
-            if len(message["content"]) > 600:
+            if len(message["content"]) > 5000:
                 message["content"] = (
-                    message["content"][:300] + "..." + message["content"][-300:]
+                    message["content"][:800] + "..." + message["content"][-800:]
                 )
         except Exception as e:
             print(str(e), "for message:", message)
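The change raises the threshold at which debug output truncates a message (600 to 5000 characters) and keeps more of each end (300 to 800). The behavior is head-and-tail truncation; a minimal equivalent, with the function name chosen here for illustration:

    def truncate_middle(text, limit=5000, keep=800):
        # Keep the first and last `keep` characters once `text` exceeds `limit`
        if len(text) > limit:
            return text[:keep] + "..." + text[-keep:]
        return text

    print(len(truncate_middle("x" * 6000)))  # 1603 (800 + 3 + 800)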
10 changes: 8 additions & 2 deletions interpreter/terminal_interface/magic_commands.py
@@ -82,7 +82,10 @@ def handle_verbose(self, arguments=None):
     print("\n\nCurrent messages:\n")
     for message in self.messages:
         message = message.copy()
-        if message["type"] == "image" and message.get("format") != "path":
+        if message["type"] == "image" and message.get("format") not in [
+            "path",
+            "description",
+        ]:
             message["content"] = (
                 message["content"][:30] + "..." + message["content"][-30:]
             )
@@ -102,7 +105,10 @@ def handle_debug(self, arguments=None):
     print("\n\nCurrent messages:\n")
     for message in self.messages:
         message = message.copy()
-        if message["type"] == "image" and message.get("format") != "path":
+        if message["type"] == "image" and message.get("format") not in [
+            "path",
+            "description",
+        ]:
             message["content"] = (
                 message["content"][:30] + "..." + message["content"][-30:]
             )
(additional changed file; its name and change counts were not captured in this view)

@@ -113,13 +113,14 @@
 interpreter.offline = True
 interpreter.os = True
 
+# Vision setup
+interpreter.computer.vision.load()
+
 # Final message
 interpreter.display_message(
     "**Warning:** In this mode, Open Interpreter will not require approval before performing actions. Be ready to close your terminal."
 )
 interpreter.display_message(
     "\n**Note:** Codestral is a relatively weak model, so OS mode is highly experimental. Try using a more powerful model for OS mode with `interpreter --os`."
 )
-interpreter.display_message(
-    "> Model set to `codestral`, experimental OS control enabled"
-)
+interpreter.display_message("> Experimental OS control enabled.")
(additional changed file; its name and change counts were not captured in this view)

@@ -17,7 +17,7 @@
 User: The code you ran produced no output. Was this expected, or are we finished?
 Assistant: No further action is required; the provided snippet opens Chrome.
 
-You have access to ONE special function called `computer.vision.query(query="Describe this image.", path="image.jpg")`. This will ask a vision AI model the query, regarding the image at path. For example:
+You have access to TWO special functions called `computer.vision.query(query="Describe this image.", path="image.jpg")` (asks a vision AI model the query, regarding the image at path) and `computer.vision.ocr(path="image.jpg")` (returns text in the image at path). For example:
 
 User: Rename the images on my desktop to something more descriptive.
 Assistant: Viewing and renaming images.
@@ -53,6 +53,25 @@
 ```
 User: The code you ran produced no output. Was this expected, or are we finished?
 Assistant: We are finished.
 
+User: What text is in the image 'user.png' on my desktop?
+Assistant: ```python
+from pathlib import Path
+
+# Get the user's desktop directory in a cross-platform way
+desktop_dir = Path.home() / 'Desktop'
+
+# Define the path to the image
+image_path = desktop_dir / 'user.png'
+
+# Get the text in the image
+text_in_image = computer.vision.ocr(path=str(image_path))
+text_in_image
+```
+
+User: The code you ran produced this output: "29294 is the username". What does this mean?
+Assistant: The output means that the `user.png` image on your desktop contains the text "29294 is the username".
+
 NEVER use placeholders. Always specify exact paths, and use cross-platform ways of determining the desktop, documents, etc. folders.
@@ -65,15 +84,16 @@
 
 # LLM settings
 interpreter.llm.model = "ollama/codestral"
-interpreter.llm.load()  # Loads Ollama models
 interpreter.llm.supports_functions = False
 interpreter.llm.execution_instructions = False
 interpreter.llm.max_tokens = 1000
 interpreter.llm.context_window = 7000
+interpreter.llm.load()  # Loads Ollama models
 
 # Computer settings
 interpreter.computer.import_computer_api = True
 interpreter.computer.system_message = ""  # The default will explain how to use the full Computer API, and append this to the system message. For local models, we want more control, so we set this to "". The system message will ONLY be what's above ^
+interpreter.computer.vision.load()  # Load vision models
 
 # Misc settings
 interpreter.auto_run = False
(diffs for the remaining changed files did not load in this capture)