Skip to content

Commit

Permalink
Merge pull request #775 from U-C4N/main
Browse files Browse the repository at this point in the history
This commit focuses on optimizing the utility modules in the codebase…
  • Loading branch information
VinciGit00 authored Oct 30, 2024
2 parents deed355 + 09c9678 commit bb2373d
Show file tree
Hide file tree
Showing 19 changed files with 279 additions and 165 deletions.
87 changes: 87 additions & 0 deletions .well-known/funding-manifest-urls/funding.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
{
"version": "v1.0.0",
"entity": {
"type": "individual",
"role": "maintainer",
"name": "Marco Vinciguerra",
"email": "mvincig11@gmail.com",
"phone": "",
"description": "I'm dedicated to advancing web scraping and data extraction through AI-powered tools, focusing on making data access more accessible and ethical. My mission is to create solutions that uphold digital freedoms and support open internet principles.",
"webpageUrl": {
"url": "https://scrapegraphai.com"
}
},
"projects": [
{
"guid": "scrapegraph-core",
"name": "ScrapeGraphAI Core",
"description": "An AI-powered web scraping framework that intelligently extracts structured data from websites with automatic pattern recognition, adaptive scraping strategies, and built-in rate limiting. Recognized as a top 200 open-source AI project globally.",
"webpageUrl": {
"url": "https://scrapegraphai.com/projects/core"
},
"repositoryUrl": {
"url": "https://github.com/ScrapeGraphAI/Scrapegraph-ai"
},
"licenses": ["spdx:MIT"],
"tags": ["web-scraping", "ai", "data-extraction", "python", "machine-learning", "open-source", "llm"]
}
],
"funding": {
"channels": [
{
"guid": "mybank",
"type": "bank",
"address": "",
"description": "Will accept direct bank transfers. Please e-mail me for details."
},
{
"guid": "mypay",
"type": "payment-provider",
"address": "https://example.com/payme/@myid",
"description": "Pay with your debit/credit card through this gateway and set up recurring subscriptions."
}
],
"plans": [
{
"guid": "infrastructure",
"status": "active",
"name": "Infrastructure Support",
"description": "Help cover monthly cloud infrastructure costs, including API servers, model hosting, and data storage.",
"amount": 750,
"currency": "USD",
"frequency": "monthly",
"channels": ["mybank"]
},
{
"guid": "developer-compensation",
"status": "active",
"name": "Developer Compensation",
"description": "Provides financial support for developers working on maintenance, updates, and feature additions for the projects.",
"amount": 2500,
"currency": "USD",
"frequency": "monthly",
"channels": ["mybank"]
},
{
"guid": "community-backer",
"status": "active",
"name": "Community Backer",
"description": "Support our open-source efforts with any contribution amount. Every donation helps!",
"amount": 5,
"currency": "USD",
"frequency": "monthly",
"channels": ["mypay"]
}
],
"history": [
{
"year": 2024,
"income": 15000,
"expenses": 15000,
"taxes": 0,
"currency": "USD",
"description": "Experienced a temporary dip in donations, with improvements expected."
}
]
}
}
44 changes: 43 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,50 @@
## [1.27.0-beta.13](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.12...v1.27.0-beta.13) (2024-10-29)
## [1.27.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.7...v1.27.0) (2024-10-26)


### Features

* add conditional node structure to the smart_scraper_graph and implemented a structured way to check condition ([cacd9cd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cacd9cde004dace1a7dcc27981245632a78b95f3))
* add integration with scrape.do ([ae275ec](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ae275ec5e86c0bb8fdbeadc2e5f69816d1dea635))
* add model integration gpt4 ([51c55eb](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/51c55eb3a2984ba60572edbcdea4c30620e18d76))
* implement ScrapeGraph class for only web scraping automation ([612c644](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/612c644623fa6f4fe77a64a5f1a6a4d6cd5f4254))
* Implement SmartScraperMultiParseMergeFirstGraph class that scrapes a list of URLs and merge the content first and finally generates answers to a given prompt. ([3e3e1b2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3e3e1b2f3ae8ed803d03b3b44b199e139baa68d4))
* refactoring of export functions ([0ea00c0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/0ea00c078f2811f0d1b356bd84cafde80763c703))
* refactoring of get_probable_tags node ([f658092](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f658092dffb20ea111cc00950f617057482788f4))
* refactoring of ScrapeGraph to SmartScraperLiteGraph ([52b6bf5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/52b6bf5fb8c570aa8ef026916230c5d52996f887))



### Bug Fixes

* fix export function ([c8a000f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c8a000f1d943734a921b34e91498b2f29c8c9422))
* fix the example variable name ([69ff649](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/69ff6495564a5c670b89c0f802ebb1602f0e7cfa))
* remove variable "max_result" not being used in the code ([e76a68a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e76a68a782e5bce48d421cb620d0b7bffa412918))


### chore

* fix example ([9cd9a87](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9cd9a874f91bbbb2990444818e8ab2d0855cc361))


### Test

* Add scrape_graph test ([cdb3c11](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cdb3c1100ee1117afedbc70437317acaf7c7c1d3))
* Add smart_scraper_multi_parse_merge_first_graph test ([464b8b0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/464b8b04ea0d51280849173d5eda92d4d4db8612))


### CI

* **release:** 1.26.6-beta.1 [skip ci] ([e0fc457](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e0fc457d1a850f3306d473fbde55dd800133b404))
* **release:** 1.27.0-beta.1 [skip ci] ([9266a36](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9266a36b2efdf7027470d59aa14b654d68f7cb51))
* **release:** 1.27.0-beta.10 [skip ci] ([eee131e](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/eee131e959a36a4471f72610eefbc1764808b6be))
* **release:** 1.27.0-beta.2 [skip ci] ([d84d295](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/d84d29538985ef8d04badfed547c6fdc73d7774d))
* **release:** 1.27.0-beta.3 [skip ci] ([f576afa](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f576afaf0c1dd6d1dbf79fd5e642f6dca9dbe862))
* **release:** 1.27.0-beta.4 [skip ci] ([3d6bbcd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3d6bbcdaa3828ff257adb22f2f7c1a46343de5b5))
* **release:** 1.27.0-beta.5 [skip ci] ([5002c71](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5002c713d5a76b2c2e4313f888d9768e3f3142e1))
* **release:** 1.27.0-beta.6 [skip ci] ([94b9836](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/94b9836ef6cd9c24bb8c04d7049d5477cc8ed807))
* **release:** 1.27.0-beta.7 [skip ci] ([407f1ce](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/407f1ce4eb22fb284ef0624dd3f7bf7ba432fa5c))
* **release:** 1.27.0-beta.8 [skip ci] ([4f1ed93](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/4f1ed939e671e46bb546b6b605db87e87c0d66ee))
* **release:** 1.27.0-beta.9 [skip ci] ([fd57cc7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/fd57cc7c126658960e33b7214c2cc656ea032d8f))
* **AbstractGraph:** manually select model tokens ([f79f399](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f79f399ee0d660f162e0cb96d9faba48ecdc88b2)), closes [#768](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/768)

## [1.27.0-beta.12](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.11...v1.27.0-beta.12) (2024-10-28)
Expand Down
7 changes: 1 addition & 6 deletions docs/source/getting_started/examples.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ OpenAI models
graph_config = {
"llm": {
"api_key": openai_key,
"model": "openai/gpt-3.5-turbo",
"model": "openai/gpt-4o",
},
}
Expand Down Expand Up @@ -67,11 +67,6 @@ After that, you can run the following code, using only your machine resources br
"format": "json", # Ollama needs the format to be specified explicitly
"model_tokens": 2000, # depending on the model set context length
"base_url": "http://localhost:11434", # set ollama URL of the local host (YOU CAN CHANGE IT, if you have a different endpoint)
},
"embeddings": {
"model": "ollama/nomic-embed-text",
"temperature": 0,
"base_url": "http://localhost:11434", # set ollama URL
}
}
Expand Down
4 changes: 4 additions & 0 deletions docs/source/introduction/overview.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,16 @@ OpenAI Models
- GPT-3.5 Turbo (16,385 tokens)
- GPT-4 (8,192 tokens)
- GPT-4 Turbo Preview (128,000 tokens)
- GPT-4o (128,000 tokens)
- GPT-4o-mini (128,000 tokens)

Azure OpenAI Models
-------------------
- GPT-3.5 Turbo (16,385 tokens)
- GPT-4 (8,192 tokens)
- GPT-4 Turbo Preview (128,000 tokens)
- GPT-4o (128,000 tokens)
- GPT-4o-mini (128,000 tokens)

Google AI Models
----------------
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
[project]
name = "scrapegraphai"


version = "1.27.0b13"


Expand Down
2 changes: 1 addition & 1 deletion scrapegraphai/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""
__init__.py file for scrapegraphai folder
__init__.py file for scrapegraphai folder
"""
2 changes: 1 addition & 1 deletion scrapegraphai/builders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
__init__.py file for builders folder
This module contains the builders for constructing various components in the ScrapeGraphAI application.
"""

from .graph_builder import GraphBuilder
4 changes: 3 additions & 1 deletion scrapegraphai/docloaders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
"""__init__.py file for docloaders folder"""
"""
This module handles document loading functionalities for the ScrapeGraphAI application.
"""

from .chromium import ChromiumLoader
from .browser_base import browser_base_fetch
Expand Down
4 changes: 2 additions & 2 deletions scrapegraphai/graphs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
__init__.py file for graphs folder
"""
This module defines the graph structures and related functionalities for the ScrapeGraphAI application.
"""

from .abstract_graph import AbstractGraph
Expand Down
2 changes: 1 addition & 1 deletion scrapegraphai/graphs/document_scraper_graph.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
md_scraper module
This module implements the Document Scraper Graph for the ScrapeGraphAI application.
"""
from typing import Optional
import logging
Expand Down
2 changes: 1 addition & 1 deletion scrapegraphai/graphs/omni_scraper_graph.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
OmniScraperGraph Module
This module implements the Omni Scraper Graph for the ScrapeGraphAI application.
"""
from typing import Optional
from pydantic import BaseModel
Expand Down
4 changes: 2 additions & 2 deletions scrapegraphai/helpers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
__init__.py for the helpers folder
"""
This module provides helper functions and utilities for the ScrapeGraphAI application.
"""
from .nodes_metadata import nodes_metadata
from .schemas import graph_schema
Expand Down
2 changes: 1 addition & 1 deletion scrapegraphai/models/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
__init__.py file for models folder
This module contains the model definitions used in the ScrapeGraphAI application.
"""
from .openai_itt import OpenAIImageToText
from .openai_tts import OpenAITextToSpeech
Expand Down
4 changes: 2 additions & 2 deletions scrapegraphai/nodes/base_node.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
BaseNode Module
"""
This module defines the base node class for the ScrapeGraphAI application.
"""
import re
from abc import ABC, abstractmethod
Expand Down
2 changes: 1 addition & 1 deletion scrapegraphai/nodes/fetch_node.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""""
"""
FetchNode Module
"""
import json
Expand Down
2 changes: 1 addition & 1 deletion scrapegraphai/prompts/description_node_prompts.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
description node prompts
This module contains prompts for description nodes in the ScrapeGraphAI application.
"""

DESCRIPTION_NODE_PROMPT = """
Expand Down
19 changes: 12 additions & 7 deletions scrapegraphai/utils/cleanup_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,13 +60,18 @@ def minify_html(html):
"""
minify_html function
"""
html = re.sub(r'<!--.*?-->', '', html, flags=re.DOTALL)

html = re.sub(r'>\s+<', '><', html)
html = re.sub(r'\s+>', '>', html)
html = re.sub(r'<\s+', '<', html)
html = re.sub(r'\s+', ' ', html)
html = re.sub(r'\s*=\s*', '=', html)
# Combine multiple regex operations into one for better performance
patterns = [
(r'<!--.*?-->', '', re.DOTALL),
(r'>\s+<', '><', 0),
(r'\s+>', '>', 0),
(r'<\s+', '<', 0),
(r'\s+', ' ', 0),
(r'\s*=\s*', '=', 0)
]

for pattern, repl, flags in patterns:
html = re.sub(pattern, repl, html, flags=flags)

return html.strip()

Expand Down
74 changes: 28 additions & 46 deletions scrapegraphai/utils/copy.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,56 +30,38 @@ def is_boto3_client(obj):

def safe_deepcopy(obj: Any) -> Any:
    """
    Safely create a deep copy of an object, handling special cases.

    Tries ``copy.deepcopy`` first; on failure, falls back to recursive
    element-wise copying for common containers, then to a shallow copy.

    Args:
        obj: Object to copy.

    Returns:
        A deep copy of the object where possible; un-copyable leaves
        (e.g. boto3 clients) are returned as-is.

    Raises:
        DeepCopyError: If the object cannot be deep- or shallow-copied.
    """
    try:
        return copy.deepcopy(obj)
    except (TypeError, AttributeError) as e:
        # Immutable primitives need no copying.
        if obj is None or isinstance(obj, (str, int, float, bool)):
            return obj

        # Rebuild common containers element by element so that a single
        # un-deepcopyable member doesn't poison the whole structure.
        if isinstance(obj, (list, set)):
            return type(obj)(safe_deepcopy(v) for v in obj)

        if isinstance(obj, dict):
            return {k: safe_deepcopy(v) for k, v in obj.items()}

        if isinstance(obj, tuple):
            return tuple(safe_deepcopy(v) for v in obj)

        if isinstance(obj, frozenset):
            return frozenset(safe_deepcopy(v) for v in obj)

        if is_boto3_client(obj):
            # boto3 clients wrap live network resources; returning the
            # original instance is the only safe option.
            return obj

        # Last resort: shallow copy. Wrap the failure explicitly — an
        # exception raised inside this handler would NOT be caught by a
        # sibling `except` clause, so without this wrap a copy.copy
        # failure escaped as a raw TypeError instead of DeepCopyError.
        try:
            return copy.copy(obj)
        except (TypeError, AttributeError):
            raise DeepCopyError(
                f"Cannot deep copy object of type {type(obj)}"
            ) from e
Loading

0 comments on commit bb2373d

Please sign in to comment.