Gemini bbox visualization tool

Prompts: https://gist.github.com/simonw/c9b55757c8959b2c84d0d9bddd020f4d Which used https://gist.github.com/simonw/40ff639e96d55a1df7ebfa7db1974b92 See also: simonw/llm#557 (comment)
simonw · Aug 26, 2024 · 33199b2 · 33199b2
1 parent 1390c40
commit 33199b2
Showing 1 changed file with 142 additions and 0 deletions.
diff --git a/gemini-bbox-tool.html b/gemini-bbox-tool.html
@@ -0,0 +1,142 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Gemini API Image Processor with Bounding Box Visualization</title>
+    <script type="module">
+        import { GoogleGenerativeAI } from "https://esm.run/@google/generative-ai";
+        import { marked } from "https://esm.run/marked";
+
+        function getApiKey() {
+          let apiKey = localStorage.getItem("GEMINI_API_KEY");
+          if (!apiKey) {
+            apiKey = prompt("Please enter your Gemini API key:");
+            if (apiKey) {
+              localStorage.setItem("GEMINI_API_KEY", apiKey);
+            }
+          }
+          return apiKey;
+        }
+
+        async function getGenerativeModel(params) {
+            const API_KEY = getApiKey();
+            const genAI = new GoogleGenerativeAI(API_KEY);
+            return genAI.getGenerativeModel(params);
+        }
+
+        async function fileToGenerativePart(file) {
+            return new Promise((resolve) => {
+                const reader = new FileReader();
+                reader.onloadend = () => resolve({
+                    inlineData: { 
+                        data: reader.result.split(",")[1],
+                        mimeType: file.type 
+                    }
+                });
+                reader.readAsDataURL(file);
+            });
+        }
+
+        async function processImageAndPrompt() {
+            const fileInput = document.getElementById('imageInput');
+            const promptInput = document.getElementById('promptInput');
+            const resultDiv = document.getElementById('result');
+
+            if (!fileInput.files[0] || !promptInput.value) {
+                alert('Please select an image and enter a prompt.');
+                return;
+            }
+
+            resultDiv.innerHTML = 'Processing...';
+
+            try {
+                const model = await getGenerativeModel({ model: "gemini-1.5-pro" });
+                const imagePart = await fileToGenerativePart(fileInput.files[0]);
+
+                const result = await model.generateContent([promptInput.value, imagePart]);
+                const response = await result.response;
+                const text = response.text();
+
+                resultDiv.innerHTML = marked.parse(text);
+
+                // Extract coordinates from the response
+                const coordinates = extractCoordinates(text);
+                if (coordinates.length > 0) {
+                    displayImageWithBoundingBoxes(fileInput.files[0], coordinates);
+                }
+            } catch (error) {
+                resultDiv.innerHTML = `Error: ${error.message}`;
+            }
+        }
+
+        function extractCoordinates(text) {
+            const regex = /\[\s*\d+\s*,\s*\d+\s*,\s*\d+\s*,\s*\d+\s*\]/g;
+            const matches = text.match(regex) || [];
+            return matches.map(JSON.parse);
+        }
+
+        function displayImageWithBoundingBoxes(file, coordinates) {
+            const reader = new FileReader();
+            reader.onload = function(event) {
+                const image = new Image();
+                image.onload = function() {
+                    const canvas = document.getElementById('canvas');
+                    canvas.width = image.width;
+                    canvas.height = image.height;
+                    const ctx = canvas.getContext('2d');
+                    ctx.drawImage(image, 0, 0);
+
+                    const colors = ['#FF0000', '#00FF00', '#0000FF', '#FFFF00', '#FF00FF', '#00FFFF'];
+                    coordinates.forEach((box, index) => {
+                        const [ymin, xmin, ymax, xmax] = box.map(coord => (1000 - coord) / 1000);
+                        const width = (xmax - xmin) * image.width;
+                        const height = (ymax - ymin) * image.height;
+
+                        ctx.strokeStyle = colors[index % colors.length];
+                        ctx.lineWidth = 2;
+                        ctx.strokeRect(xmin * image.width, ymin * image.height, width, height);
+                    });
+                };
+                image.src = event.target.result;
+            };
+            reader.readAsDataURL(file);
+        }
+
+        // Attach event listener to the submit button
+        document.getElementById('submitBtn').addEventListener('click', processImageAndPrompt);
+    </script>
+    <style>
+        body {
+            font-family: Arial, sans-serif;
+            max-width: 800px;
+            margin: 0 auto;
+            padding: 20px;
+        }
+        textarea {
+            width: 100%;
+            height: 100px;
+        }
+        #result, #imageContainer {
+            margin-top: 20px;
+            border: 1px solid #ccc;
+            padding: 10px;
+        }
+        #canvas {
+            max-width: 100%;
+            height: auto;
+        }
+    </style>
+</head>
+<body>
+    <h1>Gemini API Image Processor with Bounding Box Visualization</h1>
+    <input type="file" id="imageInput" accept="image/*">
+    <textarea id="promptInput">Return bounding boxes as [ymin, xmin, ymax, xmax]
+</textarea>
+    <button id="submitBtn">Process</button>
+    <div id="result"></div>
+    <div id="imageContainer">
+        <canvas id="canvas"></canvas>
+    </div>
+</body>
+</html>