Skip to content

Commit

Permalink
refactor(storage-watchdog): adjust storage watchdog implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
vringar committed Oct 11, 2023
1 parent 213a4c9 commit a11420b
Show file tree
Hide file tree
Showing 6 changed files with 170 additions and 181 deletions.
11 changes: 6 additions & 5 deletions demo_watchdog.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
print("Loading tranco top sites list...")
t = tranco.Tranco(cache=True, cache_dir=".tranco")
latest_list = t.list()
sites = ["http://" + x for x in latest_list.top(10)]
sites = ["http://" + x for x in latest_list.top(100)]
else:
sites = [
"http://www.example.com",
Expand All @@ -28,7 +28,7 @@
"https://www.google.com",
"https://www.minecraft.net",
"https://www.nytimes.com",
"https://www.github.com"
"https://www.github.com",
]

# Loads the default ManagerParams
Expand All @@ -52,8 +52,9 @@
# a broken function
# Record DNS resolution
browser_param.dns_instrument = True
# Specify the location of temporary files. Ensure directory exists when specifying.
# Specify the location of temporary files. Ensure directory exists when specifying.
# browser_param.tmp_profile_dir = "/"
browser_param.maximum_profile_size = 52428800

# Update TaskManager configuration (use this for crawl-wide settings)
manager_params.data_directory = Path("./datadir/")
Expand All @@ -64,7 +65,7 @@
# Please refer to docs/Configuration.md#platform-configuration-options for more information
# manager_params.memory_watchdog = True
# manager_params.process_watchdog = True
manager_params.storage_watchdog_enable = 52428800


# Commands time out by default after 60 seconds
with TaskManager(
Expand All @@ -74,7 +75,7 @@
None,
) as manager:
# Visits the sites

for index, site in enumerate(sites):

def callback(success: bool, val: str = site) -> None:
Expand Down
33 changes: 11 additions & 22 deletions openwpm/browser_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
kill_process_and_children,
parse_traceback_for_sentry,
)
from .utilities.storage_watchdog import StorageWatchdogThread
from .utilities.storage_watchdog import profile_size_exceeds_max_size

pickling_support.install()

Expand All @@ -43,7 +43,7 @@


class BrowserManagerHandle:
"""The BrowserManagerHandle class is responsible for holding all of the
"""The BrowserManagerHandle class is responsible for holding all the
configuration and status information on the BrowserManager process
it corresponds to. It also includes a set of methods for managing
the BrowserManager process and its child processes/threads.
Expand Down Expand Up @@ -504,26 +504,13 @@ def execute_command_sequence(

# Allow StorageWatchdog to utilize built-in browser reset functionality
# which results in a graceful restart of the browser instance
if self.manager_params.storage_watchdog_enable:

# storage_checker = threading.Thread(target=self.manager_params.storage_watchdog_obj.periodic_check, args=([self.current_profile_path, self]))
# storage_checker.daemon = True
# storage_checker.name = f"OpenWPM-storage-checker-{self.browser_id}"
storage_checker = StorageWatchdogThread(self.manager_params.storage_watchdog_obj, [
self.current_profile_path,
self
])
storage_checker.daemon = True
storage_checker.name = ""
storage_checker.start()
storage_checker.join()

# storage_checker.start()
# storage_checker.join()

# reset = self.manager_params.storage_watchdog_obj.periodic_check(self.current_profile_path, self)
reset = storage_checker.ret_value
if self.browser_params.maximum_profile_size:
assert self.current_profile_path is not None

reset = profile_size_exceeds_max_size(
self.current_profile_path,
self.browser_params.maximum_profile_size,
)

if self.restart_required or reset:
success = self.restart_browser_manager(clear_profile=reset)
Expand Down Expand Up @@ -589,7 +576,9 @@ def kill_browser_manager(self):
)
if self.display_port is not None: # xvfb display lock
# lockfile = "/tmp/.X%s-lock" % self.display_port
lockfile = os.path.join(self.browser_params.tmp_profile_dir, f".X{self.display_port}-lock")
lockfile = os.path.join(
self.browser_params.tmp_profile_dir, f".X{self.display_port}-lock"
)

try:
os.remove(lockfile)
Expand Down
70 changes: 41 additions & 29 deletions openwpm/config.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import os
import tempfile
from dataclasses import dataclass, field
from json import JSONEncoder
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
import tempfile

from dataclasses_json import DataClassJsonMixin
from dataclasses_json import config as DCJConfig
Expand Down Expand Up @@ -100,14 +99,46 @@ class BrowserParams(DataClassJsonMixin):
profile_archive_dir: Optional[Path] = field(
default=None, metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path)
)

tmp_profile_dir: str = tempfile.gettempdir()

tmp_profile_dir: Path = field(
default=Path(tempfile.gettempdir()),
metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path),
)
"""
The tmp_profile_dir defaults to the OS's temporary file folder (typically /tmp) and is where the generated
browser profiles and residual files are stored.
"""



maximum_profile_size: Optional[int] = None
"""
The total amount of on-disk space, in bytes, that a profile is allowed to consume.
If this option is not set, no checks will be performed.
Rationale
---------
This option can serve as a happy medium between killing a browser after each
crawl and allowing the application to still perform quickly.
It is a way to save space
in a limited environment with minimal detriment to speed.
If the maximum_profile_size is exceeded after a CommandSequence
is completed, the browser will be shut down and a new one will
be created. **Even with this setting you may temporarily have
more disk usage than the sum of all maximum_profile_sizes.**
However, this will also ensure that a CommandSequence is
allowed to complete without undue interruptions.
Sample values
-------------
* 1073741824: 1GB
* 20971520: 20MB - for testing purposes
* 52428800: 50MB
* 73400320: 70MB
* 104857600: 100MB - IDEAL for 10+ browsers
"""

recovery_tar: Optional[Path] = None
donottrack: bool = False
tracking_protection: bool = False
Expand Down Expand Up @@ -142,30 +173,11 @@ class ManagerParams(DataClassJsonMixin):
"""A watchdog that tries to ensure that no Firefox instance takes up too much memory.
It is mostly useful for long running cloud crawls"""
process_watchdog: bool = False


storage_watchdog_enable: Optional[int] = None
"""A watchdog that serves as a happy medium between killing a browser after each
crawl and allowing the application to still perform quickly. Used as a way to save space
in a limited environment with minimal detriment to speed. This Optional[int] should be the threshold
size of the folder in bytes.
```
# Sample values:
1073741824: 1GB
20971520: 20MB - for testing purposes
52428800: 50MB
73400320: 70MB
104857600: 100MB - IDEAL for 10+ browsers
```
"""It is used to create another thread that kills off `GeckoDriver` (or `Xvfb`)
instances that haven't been spawned by OpenWPM. (GeckoDriver is used by
Selenium to control Firefox, and Xvfb is a "virtual display" that lets us simulate having graphics when running on a server.)
"""

storage_watchdog_obj = None # DO NOT EDIT THIS LINE
"""Stores a handle to the actual watchdog object."""

"""- It is used to create another thread that kills off `GeckoDriver` (or `Xvfb`) instances that haven't been spawned by OpenWPM. (GeckoDriver is used by
=======
"""It is used to create another thread that kills off `GeckoDriver` (or `Xvfb`) instances that haven't been spawned by OpenWPM. (GeckoDriver is used by
Selenium to control Firefox and Xvfb a "virtual display" so we simulate having graphics when running on a server)."""

num_browsers: int = 1
_failure_limit: Optional[int] = None
"""The number of command failures the platform will tolerate before raising a
Expand Down
4 changes: 3 additions & 1 deletion openwpm/deploy_browsers/deploy_firefox.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ def deploy_firefox(

root_dir = os.path.dirname(__file__) # directory of this file

browser_profile_path = Path(tempfile.mkdtemp(prefix="firefox_profile_", dir=browser_params.tmp_profile_dir))
browser_profile_path = Path(
tempfile.mkdtemp(prefix="firefox_profile_", dir=browser_params.tmp_profile_dir)
)
status_queue.put(("STATUS", "Profile Created", browser_profile_path))

# Use Options instead of FirefoxProfile to set preferences since the
Expand Down
33 changes: 18 additions & 15 deletions openwpm/task_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import pickle
import threading
import time
from functools import reduce
from types import TracebackType
from typing import Any, Dict, List, Optional, Set, Type

Expand All @@ -29,7 +30,7 @@
)
from .utilities.multiprocess_utils import kill_process_and_children
from .utilities.platform_utils import get_configuration_string, get_version
from .utilities.storage_watchdog import StorageWatchdog
from .utilities.storage_watchdog import StorageLogger

tblib.pickling_support.install()

Expand Down Expand Up @@ -80,12 +81,10 @@ def __init__(

manager_params.source_dump_path = manager_params.data_directory / "sources"

self.manager_params = manager_params
self.browser_params = browser_params
self.manager_params: ManagerParamsInternal = manager_params
self.browser_params: List[BrowserParamsInternal] = browser_params
self._logger_kwargs = logger_kwargs



# Create data directories if they do not exist
if not os.path.exists(manager_params.screenshot_path):
os.makedirs(manager_params.screenshot_path)
Expand Down Expand Up @@ -131,16 +130,20 @@ def __init__(
thread.name = "OpenWPM-watchdog"
thread.start()

# Start the StorageWatchdog
if self.manager_params.storage_watchdog_enable:

storage_watchdog = StorageWatchdog(self.browser_params[0].tmp_profile_dir ,self.manager_params.storage_watchdog_enable)
self.manager_params.storage_watchdog_obj = storage_watchdog
storage_watchdog_thread = threading.Thread(target=storage_watchdog.run, args=())
storage_watchdog_thread.daemon = True
storage_watchdog_thread.name = "OpenWPM-storage-watchdog"

storage_watchdog_thread.start()
# Start the StorageLogger if a maximum storage value has been specified for any browser
if reduce(
lambda x, y: x or y,
map(lambda p: p.maximum_profile_size is not None, self.browser_params),
False,
):
storage_logger = StorageLogger(
self.browser_params[0].tmp_profile_dir,
)

storage_logger.daemon = True
storage_logger.name = "OpenWPM-storage-logger"

storage_logger.start()
# Save crawl config information to database
openwpm_v, browser_v = get_version()
self.storage_controller_handle.save_configuration(
Expand Down
Loading

0 comments on commit a11420b

Please sign in to comment.