import logging
import logging.config
import threading
import time

import alive_progress

import scraper

# Global alive_progress configuration: classic theme, dual-line display, no stats, and Ctrl+C captured while a bar is active.
alive_progress.config_handler.set_global(ctrl_c=False, dual_line=True, theme="classic", stats=False)


def main() -> None:
    """Dispatch each command-line flag to its corresponding scraper action."""
    args = scraper.argparse_setup()

    if args.clean_data:
        scraper.clean_datapoints()

    if args.visualize:
        scraper.visualize_data(args.all, args.category, args.id, args.name, args.up_to_date, args.compare)

    if args.reset:
        scraper.reset(args.category, args.name, args.id, args.all)

    if args.add:
        scraper.add_products(args.category, args.url)

    if args.activate:
        scraper.update_products_is_active_with_product_codes(args.id, True)

    if args.deactivate:
        scraper.update_products_is_active_with_product_codes(args.id, False)

    if args.search:
        scraper.search(args.search)

    if args.scrape:
        if args.threads:
            scrape_with_threads()
        else:
            scrape()

    if args.latest_datapoint:
        scraper.print_latest_datapoints(args.name, args.id, args.category)

    if args.list_products:
        scraper.print_all_products()

    if args.delete:
        scraper.delete(args.category, args.name, args.id, args.all)


def scrape() -> None:
    """Scrape all active products one at a time, pausing between requests."""
    print("Scraping...")

    request_delay = scraper.Config.get_request_delay()

    active_products = scraper.db.get_all_products(select_only_active=True)
    products = scraper.Format.db_products_to_scrapers(active_products)

    with alive_progress.alive_bar(len(products), title="Scraping") as bar:
        # Scrape and save scraped data for each product (sequentially)
        for product in products:
            bar.text = f"-> {product.url}"
            time.sleep(request_delay)
            product.scrape_info()
            scraper.add_product.add_new_datapoint_with_scraper(product)
            bar()


def scrape_with_threads() -> None:
    """Scrape active products in parallel, with one group of scraper threads per domain."""
    print("Scraping with threads...")

    request_delay = scraper.Config.get_request_delay()

    grouped_db_products = scraper.db.get_all_products_grouped_by_domains(select_only_active=True)

    grouped_products: list[list[scraper.Scraper]] = []
    for db_products in grouped_db_products:
        products = scraper.Format.db_products_to_scrapers(db_products)
        grouped_products.append(products)

    grouped_scraper_threads: list[list[threading.Thread]] = []

    # Create scraper threads and group by domain
    for products in grouped_products:
        scraper_threads = [threading.Thread(target=product.scrape_info) for product in products]
        grouped_scraper_threads.append(scraper_threads)

    products_flatten = [product for products in grouped_products for product in products]

    with alive_progress.alive_bar(len(products_flatten), title="Scraping with threads") as progress_bar:
        # Create master threads to manage scraper threads sequentially for each domain
        master_threads = [
            threading.Thread(target=scraper.start_threads_sequentially, args=[scraper_threads, request_delay, progress_bar])
            for scraper_threads in grouped_scraper_threads
        ]

        # Start all master threads
        for master_thread in master_threads:
            master_thread.start()

        # Wait for all master threads to finish
        for master_thread in master_threads:
            master_thread.join()

    # Save scraped data for each product (sequentially)
    for product in products_flatten:
        scraper.add_product.add_new_datapoint_with_scraper(product)


if __name__ == "__main__":
    scraper.db.create_db_and_tables()
    logging.config.fileConfig(
        fname=scraper.Filemanager.logging_ini_path,
        defaults={"logfilename": scraper.Filemanager.logfile_path},
    )
    main()
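
# A rough usage sketch. The actual flag names are defined in scraper.argparse_setup(), which is not shown
# here; the flags below are only inferred from the args attributes used in main() and may differ.
#
#   python3 main.py --scrape               # scrape all active products sequentially
#   python3 main.py --scrape --threads     # scrape with one thread per product, grouped by domain
#   python3 main.py --list-products        # print every product currently in the database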