Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Storage watchdog #1056

Merged
merged 15 commits into from
Oct 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@
# browser_param.callstack_instrument = True
# Record DNS resolution
browser_param.dns_instrument = True
# Set this value as appropriate for the size of your temp directory
# if you are running out of space
browser_param.maximum_profile_size = 50 * (10**20) # 50 MB = 50 * 2^20 Bytes

# Update TaskManager configuration (use this for crawl-wide settings)
manager_params.data_directory = Path("./datadir/")
Expand Down
21 changes: 11 additions & 10 deletions environment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,39 +3,40 @@ channels:
- main
dependencies:
- beautifulsoup4=4.12.2
- black=23.7.0
- black=23.9.1
- click=8.1.7
- codecov=2.1.13
- dill=0.3.7
- dill=0.3.7
- easyprocess=1.1
- gcsfs=2023.9.0
- gcsfs=2023.9.1
- geckodriver=0.33.0
- ipython=8.15.0
- isort=5.12.0
- leveldb=1.23
- multiprocess=0.70.15
- mypy=1.5.1
- nodejs=20.6.0
- nodejs=20.7.0
- pandas=2.1.0
- pillow=10.0.0
- pillow=10.0.1
- pip=23.2.1
- pre-commit=3.4.0
- psutil=5.9.5
- pyarrow=13.0.0
- pytest-asyncio=0.21.1
- pytest-cov=4.1.0
- pytest=7.4.1
- pytest=7.4.2
- python=3.11.5
- pyvirtualdisplay=3.0
- recommonmark=0.7.1
- redis-py=5.0.0
- s3fs=2023.9.0
- s3fs=2023.9.1
- selenium=4.12.0
- sentry-sdk=1.30.0
- sentry-sdk=1.31.0
- sphinx-markdown-tables=0.0.17
- sphinx=7.2.5
- sphinx=7.2.6
- tabulate=0.9.0
- tblib=1.7.0
- tblib=2.0.0
- wget=1.20.3
- pip:
- dataclasses-json==0.6.0
Expand All @@ -44,6 +45,6 @@ dependencies:
- plyvel==1.5.0
- tranco==0.6
- types-pyyaml==6.0.12.11
- types-redis==4.6.0.5
- types-redis==4.6.0.6
- types-tabulate==0.9.0.3
name: openwpm
19 changes: 17 additions & 2 deletions openwpm/browser_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
kill_process_and_children,
parse_traceback_for_sentry,
)
from .utilities.storage_watchdog import profile_size_exceeds_max_size

pickling_support.install()

Expand All @@ -42,7 +43,7 @@


class BrowserManagerHandle:
"""The BrowserManagerHandle class is responsible for holding all of the
"""The BrowserManagerHandle class is responsible for holding all the
configuration and status information on BrowserManager process
it corresponds to. It also includes a set of methods for managing
the BrowserManager process and its child processes/threads.
Expand Down Expand Up @@ -501,6 +502,16 @@ def execute_command_sequence(
if task_manager.closing:
return

# Allow StorageWatchdog to utilize built-in browser reset functionality
# which results in a graceful restart of the browser instance
if self.browser_params.maximum_profile_size:
assert self.current_profile_path is not None

reset = profile_size_exceeds_max_size(
self.current_profile_path,
self.browser_params.maximum_profile_size,
)

if self.restart_required or reset:
success = self.restart_browser_manager(clear_profile=reset)
if not success:
Expand Down Expand Up @@ -564,7 +575,11 @@ def kill_browser_manager(self):
"type %s" % (self.browser_id, str(self.display_pid))
)
if self.display_port is not None: # xvfb display lock
lockfile = "/tmp/.X%s-lock" % self.display_port
# lockfile = "/tmp/.X%s-lock" % self.display_port
lockfile = os.path.join(
self.browser_params.tmp_profile_dir, f".X{self.display_port}-lock"
)

try:
os.remove(lockfile)
except OSError:
Expand Down
50 changes: 47 additions & 3 deletions openwpm/config.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import os
import tempfile
from dataclasses import dataclass, field
from json import JSONEncoder
from pathlib import Path
Expand Down Expand Up @@ -99,6 +99,47 @@ class BrowserParams(DataClassJsonMixin):
profile_archive_dir: Optional[Path] = field(
default=None, metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path)
)

tmp_profile_dir: Path = field(
default=Path(tempfile.gettempdir()),
metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path),
)
"""
The tmp_profile_dir defaults to the OS's temporary file folder (typically /tmp) and is where the generated
browser profiles and residual files are stored.
"""

maximum_profile_size: Optional[int] = None
"""
The total amount of on disk space the generated
browser profiles and residual files are allowed to consume in bytes.
If this option is not set, no checks will be performed

Rationale
---------
This option can serve as a happy medium between killing a browser after each
crawl and allowing the application to still perform quickly.

Used as a way to save space
in a limited environment with minimal detriment to speed.

If the maximum_profile_size is exceeded after a CommandSequence
is completed, the browser will be shut down and a new one will
be created. **Even with this setting you may temporarily have
more disk usage than the sum of all maximum_profile_sizes**
However, this will also ensure that a CommandSequence is
allowed to complete without undue interruptions.

Sample values
-------------
* 1073741824: 1GB
* 20971520: 20MB - for testing purposes
* 52428800: 50MB
* 73400320: 70MB
* 104857600: 100MB - IDEAL for 10+ browsers

"""

recovery_tar: Optional[Path] = None
donottrack: bool = False
tracking_protection: bool = False
Expand Down Expand Up @@ -133,8 +174,11 @@ class ManagerParams(DataClassJsonMixin):
"""A watchdog that tries to ensure that no Firefox instance takes up too much memory.
It is mostly useful for long running cloud crawls"""
process_watchdog: bool = False
"""It is used to create another thread that kills off `GeckoDriver` (or `Xvfb`) instances that haven't been spawned by OpenWPM. (GeckoDriver is used by
Selenium to control Firefox and Xvfb a "virtual display" so we simulate having graphics when running on a server)."""
"""It is used to create another thread that kills off `GeckoDriver` (or `Xvfb`)
instances that haven't been spawned by OpenWPM. (GeckoDriver is used by
Selenium to control Firefox and Xvfb a "virtual display" so we simulate having graphics when running on a server).
"""

num_browsers: int = 1
_failure_limit: Optional[int] = None
"""The number of command failures the platform will tolerate before raising a
Expand Down
6 changes: 3 additions & 3 deletions openwpm/deploy_browsers/deploy_firefox.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ def deploy_firefox(

root_dir = os.path.dirname(__file__) # directory of this file

browser_profile_path = Path(tempfile.mkdtemp(prefix="firefox_profile_"))
browser_profile_path = Path(
tempfile.mkdtemp(prefix="firefox_profile_", dir=browser_params.tmp_profile_dir)
)
status_queue.put(("STATUS", "Profile Created", browser_profile_path))

# Use Options instead of FirefoxProfile to set preferences since the
Expand Down Expand Up @@ -167,8 +169,6 @@ def deploy_firefox(
# Get browser process pid
if hasattr(driver, "service") and hasattr(driver.service, "process"):
pid = driver.service.process.pid
elif hasattr(driver, "binary") and hasattr(driver.options.binary, "process"):
pid = driver.options.binary.process.pid
else:
raise RuntimeError("Unable to identify Firefox process ID.")

Expand Down
23 changes: 20 additions & 3 deletions openwpm/task_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import pickle
import threading
import time
from functools import reduce
from types import TracebackType
from typing import Any, Dict, List, Optional, Set, Type

Expand All @@ -29,6 +30,7 @@
)
from .utilities.multiprocess_utils import kill_process_and_children
from .utilities.platform_utils import get_configuration_string, get_version
from .utilities.storage_watchdog import StorageLogger

tblib.pickling_support.install()

Expand Down Expand Up @@ -79,8 +81,8 @@ def __init__(

manager_params.source_dump_path = manager_params.data_directory / "sources"

self.manager_params = manager_params
self.browser_params = browser_params
self.manager_params: ManagerParamsInternal = manager_params
self.browser_params: List[BrowserParamsInternal] = browser_params
self._logger_kwargs = logger_kwargs

# Create data directories if they do not exist
Expand Down Expand Up @@ -108,7 +110,7 @@ def __init__(
self.logging_server = MPLogger(
self.manager_params.log_path,
str(structured_storage_provider),
**self._logger_kwargs
**self._logger_kwargs,
)
self.manager_params.logger_address = self.logging_server.logger_address
self.logger = logging.getLogger("openwpm")
Expand All @@ -128,6 +130,20 @@ def __init__(
thread.name = "OpenWPM-watchdog"
thread.start()

# Start the StorageLogger if a maximum storage value has been specified for any browser
if reduce(
lambda x, y: x or y,
map(lambda p: p.maximum_profile_size is not None, self.browser_params),
False,
):
storage_logger = StorageLogger(
self.browser_params[0].tmp_profile_dir,
)

storage_logger.daemon = True
storage_logger.name = "OpenWPM-storage-logger"

storage_logger.start()
# Save crawl config information to database
openwpm_v, browser_v = get_version()
self.storage_controller_handle.save_configuration(
Expand Down Expand Up @@ -363,6 +379,7 @@ def _start_thread(
# Start command execution thread
args = (self, command_sequence)
thread = threading.Thread(target=browser.execute_command_sequence, args=args)
thread.name = f"BrowserManagerHandle-{browser.browser_id}"
browser.command_thread = thread
thread.daemon = True
thread.start()
Expand Down
123 changes: 123 additions & 0 deletions openwpm/utilities/storage_watchdog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
import logging
import math
import os
import subprocess
import time
from pathlib import Path
from threading import Thread
from typing import Optional

# Nifty little function to prettyfi the output. Takes in a number of bytes and spits out the
# corresponding size in the largest unit it is able to convert to.


def convert_size(size_bytes: int) -> str:
if size_bytes == 0:
return "0B"
size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
i: int = int(math.floor(math.log(size_bytes, 1024)))
p: float = math.pow(1024, i)
s: float = round(size_bytes / p, 2)
return "%s %s" % (s, size_name[i])


def total_folder_size(startup: bool = False, root_dir: str = "/tmp") -> str:
"""Generates a human-readable message about the current size of the directory

Args:
startup (bool, optional): Runs the function on the total supplied folder.
root_dir (str, optional): The root directory that will be recursively checked.
"""

running_total: int = 0
if not startup:
for file in os.listdir(root_dir):
if "firefox" in file or ".xpi" in file or "owpm" in file or "Temp" in file:
path = os.path.join(root_dir, file)
try:
running_total += int(
subprocess.check_output(["du", "-s", "-b", path])
.split()[0]
.decode("utf-8")
)
except:
pass
return f"Currently using: {convert_size(running_total)} of storage on disk..."

for file in os.listdir(root_dir):
path = os.path.join(root_dir, file)
try:
running_total += int(
subprocess.check_output(
["du", "-s", "-b", path], stderr=subprocess.DEVNULL
)
.split()[0]
.decode("utf-8")
)
except:
pass

return f"Readable files in {root_dir} folder take up {convert_size(running_total)} of storage on disk at start time..."


class StorageLogger(Thread):
"""Logs the total amount of storage used in the supplied_dir"""

def __init__(self, supplied_dir: Optional[Path] = None) -> None:
super().__init__()
self.dir_to_watch = supplied_dir

def run(self) -> None:
logger = logging.getLogger("openwpm")
# Checks if the default dirsize and directory to watch were configured.
# If they are still the default, it exits because
# it would essentially work identically to setting the "reset" flag in the command sequence
if self.dir_to_watch is None:
logger.info("No dir_to_watch specified. StorageLogger shutting down")
return

logger.info("Starting the StorageLogger...")
logger.info(total_folder_size(startup=True))
try:
while True:
time.sleep(300) # Give storage updates every 5 minutes
logger.info(total_folder_size())
except:
print("Error")


def profile_size_exceeds_max_size(
profile_path: Path,
max_dir_size: int,
) -> bool:
logger = logging.getLogger("openwpm")
# 1073741824: # 1GB
# 20971520: # 20MB - for testing purposes
# 52428800: # 50MB
# 73400320: # 70MB
# 104857600: 100MB - IDEAL for 10+ browsers

readable_max_dir_size = convert_size(max_dir_size)

dir_size = int(
subprocess.check_output(["du", "-s", "-b", profile_path])
.split()[0]
.decode("utf-8")
)
readable_dir_size = convert_size(dir_size)

if dir_size < max_dir_size:
logger.info(
f"Current browser profile directory {profile_path} size is less than {readable_max_dir_size}: {readable_dir_size}"
)
return False
else:
logger.info(
f"{profile_path}: Folder scheduled to be deleted and recover {readable_dir_size} of storage."
)
return True


if __name__ == "__main__":
print("---Testing the StorageWatchdog folder size function---")
print(total_folder_size(startup=True))
Loading
Loading