Skip to content

Commit

Permalink
Merge pull request #775 from U-C4N/main
Browse files Browse the repository at this point in the history
This commit focuses on optimizing the utility modules in the codebase…
  • Loading branch information
VinciGit00 authored Oct 30, 2024
2 parents deed355 + 09c9678 commit bb2373d
Show file tree
Hide file tree
Showing 19 changed files with 279 additions and 165 deletions.
87 changes: 87 additions & 0 deletions .well-known/funding-manifest-urls/funding.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
{
"version": "v1.0.0",
"entity": {
"type": "individual",
"role": "maintainer",
"name": "Marco Vinciguerra",
"email": "mvincig11@gmail.com",
"phone": "",
"description": "I'm dedicated to advancing web scraping and data extraction through AI-powered tools, focusing on making data access more accessible and ethical. My mission is to create solutions that uphold digital freedoms and support open internet principles.",
"webpageUrl": {
"url": "https://scrapegraphai.com"
}
},
"projects": [
{
"guid": "scrapegraph-core",
"name": "ScrapeGraphAI Core",
"description": "An AI-powered web scraping framework that intelligently extracts structured data from websites with automatic pattern recognition, adaptive scraping strategies, and built-in rate limiting. Recognized as a top 200 open-source AI project globally.",
"webpageUrl": {
"url": "https://scrapegraphai.com/projects/core"
},
"repositoryUrl": {
"url": "https://github.com/ScrapeGraphAI/Scrapegraph-ai"
},
"licenses": ["spdx:MIT"],
"tags": ["web-scraping", "ai", "data-extraction", "python", "machine-learning", "open-source", "llm"]
}
],
"funding": {
"channels": [
{
"guid": "mybank",
"type": "bank",
"address": "",
"description": "Will accept direct bank transfers. Please e-mail me for details."
},
{
"guid": "mypay",
"type": "payment-provider",
"address": "https://example.com/payme/@myid",
"description": "Pay with your debit/credit card through this gateway and set up recurring subscriptions."
}
],
"plans": [
{
"guid": "infrastructure",
"status": "active",
"name": "Infrastructure Support",
"description": "Help cover monthly cloud infrastructure costs, including API servers, model hosting, and data storage.",
"amount": 750,
"currency": "USD",
"frequency": "monthly",
"channels": ["mybank"]
},
{
"guid": "developer-compensation",
"status": "active",
"name": "Developer Compensation",
"description": "Provides financial support for developers working on maintenance, updates, and feature additions for the projects.",
"amount": 2500,
"currency": "USD",
"frequency": "monthly",
"channels": ["mybank"]
},
{
"guid": "community-backer",
"status": "active",
"name": "Community Backer",
"description": "Support our open-source efforts with any contribution amount. Every donation helps!",
"amount": 5,
"currency": "USD",
"frequency": "monthly",
"channels": ["mypay"]
}
],
"history": [
{
"year": 2024,
"income": 15000,
"expenses": 15000,
"taxes": 0,
"currency": "USD",
"description": "Experienced a temporary dip in donations, with improvements expected."
}
]
}
}
44 changes: 43 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,50 @@
## [1.27.0-beta.13](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.12...v1.27.0-beta.13) (2024-10-29)
## [1.27.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.7...v1.27.0) (2024-10-26)


### Features

* add conditional node structure to the smart_scraper_graph and implemented a structured way to check condition ([cacd9cd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cacd9cde004dace1a7dcc27981245632a78b95f3))
* add integration with scrape.do ([ae275ec](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ae275ec5e86c0bb8fdbeadc2e5f69816d1dea635))
* add model integration gpt4 ([51c55eb](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/51c55eb3a2984ba60572edbcdea4c30620e18d76))
* implement ScrapeGraph class for only web scraping automation ([612c644](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/612c644623fa6f4fe77a64a5f1a6a4d6cd5f4254))
* Implement SmartScraperMultiParseMergeFirstGraph class that scrapes a list of URLs and merge the content first and finally generates answers to a given prompt. ([3e3e1b2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3e3e1b2f3ae8ed803d03b3b44b199e139baa68d4))
* refactoring of export functions ([0ea00c0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/0ea00c078f2811f0d1b356bd84cafde80763c703))
* refactoring of get_probable_tags node ([f658092](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f658092dffb20ea111cc00950f617057482788f4))
* refactoring of ScrapeGraph to SmartScraperLiteGraph ([52b6bf5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/52b6bf5fb8c570aa8ef026916230c5d52996f887))



### Bug Fixes

* fix export function ([c8a000f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c8a000f1d943734a921b34e91498b2f29c8c9422))
* fix the example variable name ([69ff649](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/69ff6495564a5c670b89c0f802ebb1602f0e7cfa))
* remove variable "max_result" not being used in the code ([e76a68a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e76a68a782e5bce48d421cb620d0b7bffa412918))


### chore

* fix example ([9cd9a87](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9cd9a874f91bbbb2990444818e8ab2d0855cc361))


### Test

* Add scrape_graph test ([cdb3c11](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cdb3c1100ee1117afedbc70437317acaf7c7c1d3))
* Add smart_scraper_multi_parse_merge_first_graph test ([464b8b0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/464b8b04ea0d51280849173d5eda92d4d4db8612))


### CI

* **release:** 1.26.6-beta.1 [skip ci] ([e0fc457](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e0fc457d1a850f3306d473fbde55dd800133b404))
* **release:** 1.27.0-beta.1 [skip ci] ([9266a36](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9266a36b2efdf7027470d59aa14b654d68f7cb51))
* **release:** 1.27.0-beta.10 [skip ci] ([eee131e](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/eee131e959a36a4471f72610eefbc1764808b6be))
* **release:** 1.27.0-beta.2 [skip ci] ([d84d295](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/d84d29538985ef8d04badfed547c6fdc73d7774d))
* **release:** 1.27.0-beta.3 [skip ci] ([f576afa](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f576afaf0c1dd6d1dbf79fd5e642f6dca9dbe862))
* **release:** 1.27.0-beta.4 [skip ci] ([3d6bbcd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3d6bbcdaa3828ff257adb22f2f7c1a46343de5b5))
* **release:** 1.27.0-beta.5 [skip ci] ([5002c71](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5002c713d5a76b2c2e4313f888d9768e3f3142e1))
* **release:** 1.27.0-beta.6 [skip ci] ([94b9836](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/94b9836ef6cd9c24bb8c04d7049d5477cc8ed807))
* **release:** 1.27.0-beta.7 [skip ci] ([407f1ce](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/407f1ce4eb22fb284ef0624dd3f7bf7ba432fa5c))
* **release:** 1.27.0-beta.8 [skip ci] ([4f1ed93](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/4f1ed939e671e46bb546b6b605db87e87c0d66ee))
* **release:** 1.27.0-beta.9 [skip ci] ([fd57cc7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/fd57cc7c126658960e33b7214c2cc656ea032d8f))
* **AbstractGraph:** manually select model tokens ([f79f399](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f79f399ee0d660f162e0cb96d9faba48ecdc88b2)), closes [#768](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/768)

## [1.27.0-beta.12](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.11...v1.27.0-beta.12) (2024-10-28)
Expand Down
7 changes: 1 addition & 6 deletions docs/source/getting_started/examples.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ OpenAI models
graph_config = {
"llm": {
"api_key": openai_key,
"model": "openai/gpt-3.5-turbo",
"model": "openai/gpt-4o",
},
}
Expand Down Expand Up @@ -67,11 +67,6 @@ After that, you can run the following code, using only your machine resources br
"format": "json", # Ollama needs the format to be specified explicitly
"model_tokens": 2000, # depending on the model set context length
"base_url": "http://localhost:11434", # set ollama URL of the local host (YOU CAN CHANGE IT, if you have a different endpoint)
},
"embeddings": {
"model": "ollama/nomic-embed-text",
"temperature": 0,
"base_url": "http://localhost:11434", # set ollama URL
}
}
Expand Down
4 changes: 4 additions & 0 deletions docs/source/introduction/overview.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,16 @@ OpenAI Models
- GPT-3.5 Turbo (16,385 tokens)
- GPT-4 (8,192 tokens)
- GPT-4 Turbo Preview (128,000 tokens)
- GPT-4o (128,000 tokens)
- GPT-4o-mini (128,000 tokens)

Azure OpenAI Models
-------------------
- GPT-3.5 Turbo (16,385 tokens)
- GPT-4 (8,192 tokens)
- GPT-4 Turbo Preview (128,000 tokens)
- GPT-4o (128,000 tokens)
- GPT-4o-mini (128,000 tokens)

Google AI Models
----------------
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
[project]
name = "scrapegraphai"


version = "1.27.0b13"


Expand Down
2 changes: 1 addition & 1 deletion scrapegraphai/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""
__init__.py file for scrapegraphai folder
__init__.py file for scrapegraphai folder
"""
2 changes: 1 addition & 1 deletion scrapegraphai/builders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
__init__.py file for builders folder
This module contains the builders for constructing various components in the ScrapeGraphAI application.
"""

from .graph_builder import GraphBuilder
4 changes: 3 additions & 1 deletion scrapegraphai/docloaders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
"""__init__.py file for docloaders folder"""
"""
This module handles document loading functionalities for the ScrapeGraphAI application.
"""

from .chromium import ChromiumLoader
from .browser_base import browser_base_fetch
Expand Down
4 changes: 2 additions & 2 deletions scrapegraphai/graphs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
__init__.py file for graphs folder
"""
This module defines the graph structures and related functionalities for the ScrapeGraphAI application.
"""

from .abstract_graph import AbstractGraph
Expand Down
2 changes: 1 addition & 1 deletion scrapegraphai/graphs/document_scraper_graph.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
md_scraper module
This module implements the Document Scraper Graph for the ScrapeGraphAI application.
"""
from typing import Optional
import logging
Expand Down
2 changes: 1 addition & 1 deletion scrapegraphai/graphs/omni_scraper_graph.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
OmniScraperGraph Module
This module implements the Omni Scraper Graph for the ScrapeGraphAI application.
"""
from typing import Optional
from pydantic import BaseModel
Expand Down
4 changes: 2 additions & 2 deletions scrapegraphai/helpers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
__init__.py for the helpers folder
"""
This module provides helper functions and utilities for the ScrapeGraphAI application.
"""
from .nodes_metadata import nodes_metadata
from .schemas import graph_schema
Expand Down
2 changes: 1 addition & 1 deletion scrapegraphai/models/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
__init__.py file for models folder
This module contains the model definitions used in the ScrapeGraphAI application.
"""
from .openai_itt import OpenAIImageToText
from .openai_tts import OpenAITextToSpeech
Expand Down
4 changes: 2 additions & 2 deletions scrapegraphai/nodes/base_node.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
BaseNode Module
"""
This module defines the base node class for the ScrapeGraphAI application.
"""
import re
from abc import ABC, abstractmethod
Expand Down
2 changes: 1 addition & 1 deletion scrapegraphai/nodes/fetch_node.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""""
"""
FetchNode Module
"""
import json
Expand Down
2 changes: 1 addition & 1 deletion scrapegraphai/prompts/description_node_prompts.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
description node prompts
This module contains prompts for description nodes in the ScrapeGraphAI application.
"""

DESCRIPTION_NODE_PROMPT = """
Expand Down
19 changes: 12 additions & 7 deletions scrapegraphai/utils/cleanup_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,13 +60,18 @@ def minify_html(html):
"""
minify_html function
"""
html = re.sub(r'<!--.*?-->', '', html, flags=re.DOTALL)

html = re.sub(r'>\s+<', '><', html)
html = re.sub(r'\s+>', '>', html)
html = re.sub(r'<\s+', '<', html)
html = re.sub(r'\s+', ' ', html)
html = re.sub(r'\s*=\s*', '=', html)
# Combine multiple regex operations into one for better performance
patterns = [
(r'<!--.*?-->', '', re.DOTALL),
(r'>\s+<', '><', 0),
(r'\s+>', '>', 0),
(r'<\s+', '<', 0),
(r'\s+', ' ', 0),
(r'\s*=\s*', '=', 0)
]

for pattern, repl, flags in patterns:
html = re.sub(pattern, repl, html, flags=flags)

return html.strip()

Expand Down
74 changes: 28 additions & 46 deletions scrapegraphai/utils/copy.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,56 +30,38 @@ def is_boto3_client(obj):

def safe_deepcopy(obj: Any) -> Any:
    """
    Safely create a deep copy of an object, handling special cases.

    Tries ``copy.deepcopy`` first; on failure, falls back to recursive
    element-wise copying for common containers, then to a shallow copy.

    Args:
        obj: Object to copy.

    Returns:
        A deep copy of the object where possible; un-copyable leaves
        (e.g. boto3 clients) are returned as-is.

    Raises:
        DeepCopyError: If the object cannot be deep- or shallow-copied.
    """
    try:
        return copy.deepcopy(obj)
    except (TypeError, AttributeError) as e:
        # Immutable primitives need no copying.
        if obj is None or isinstance(obj, (str, int, float, bool)):
            return obj

        # Rebuild common containers element by element so that a single
        # un-deepcopyable member doesn't poison the whole structure.
        if isinstance(obj, (list, set)):
            return type(obj)(safe_deepcopy(v) for v in obj)

        if isinstance(obj, dict):
            return {k: safe_deepcopy(v) for k, v in obj.items()}

        if isinstance(obj, tuple):
            return tuple(safe_deepcopy(v) for v in obj)

        if isinstance(obj, frozenset):
            return frozenset(safe_deepcopy(v) for v in obj)

        if is_boto3_client(obj):
            # boto3 clients wrap live network resources; returning the
            # original instance is the only safe option.
            return obj

        # Last resort: shallow copy. Wrap the failure explicitly — an
        # exception raised inside this handler would NOT be caught by a
        # sibling `except` clause, so without this wrap a copy.copy
        # failure escaped as a raw TypeError instead of DeepCopyError.
        try:
            return copy.copy(obj)
        except (TypeError, AttributeError):
            raise DeepCopyError(
                f"Cannot deep copy object of type {type(obj)}"
            ) from e
Loading

0 comments on commit bb2373d

Please sign in to comment.