Skip to content

Commit

Permalink
feat(browser_manager): enable full XPI cleanup
Browse files Browse the repository at this point in the history
Set TMPDIR in the environment to a unique directory for each browser
instance and delete it when the browser quits.

This is a workaround for an issue with geckodriver.  When the OpenWPM
extension is installed via `WebDriver.install_addon()`, geckodriver
makes a copy of the XPI file in TMPDIR.  However, geckodriver never
deletes that file.  So on a stateless crawl, you end up with one copy of
the XPI file for each site visited.

This workaround sets TMPDIR in the environment before creating the
geckodriver service, and then deletes the directory after
`driver.quit()` returns in `BrowserManager.run()`.  We use this
indirection because we don't have access to the name of the temporary
file, and it doesn't seem safe to just delete XPI files in /tmp.
  • Loading branch information
ndanner-wesleyancs authored and vringar committed Aug 7, 2024
1 parent a03fc7c commit 58181c6
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 0 deletions.
55 changes: 55 additions & 0 deletions openwpm/browser_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,33 @@ def launch_browser_manager(self) -> bool:

crash_recovery = True

# Create a unique temporary directory that we can delete
# when we shut down. Note that this doesn't force anything to
# use `tmpdir`; it just makes the directory available.
if self.browser_params.tmpdir is not None:
self.logger.debug(
"BROWSER %i: leftover temp directory %s? Deleting it."
% (self.browser_id, self.browser_params.tmpdir)
)
try:
shutil.rmtree(self.browser_params.tmpdir)
except Exception:
self.logger.debug(
"BROWSER %i: error deleting %s"
% (
self.browser_id,
self.browser_params.tmpdir,
),
exc_info=True,
)
self.browser_params.tmpdir = Path(
tempfile.mkdtemp(prefix="openwpm_", dir=os.getenv("TMPDIR", default="/tmp"))
)
self.logger.debug(
"BROWSER %i: Using temp dir %s"
% (self.browser_id, self.browser_params.tmpdir)
)

self.logger.info("BROWSER %i: Launching browser..." % self.browser_id)
self.is_fresh = not crash_recovery

Expand Down Expand Up @@ -340,6 +367,34 @@ def close_browser_manager(self, force: bool = False) -> None:
if not shutdown_complete:
self.kill_browser_manager()

# Delete the temporary directory used by geckodriver.
if self.browser_params.tmpdir is not None:
try:
t1 = time.time()
self.logger.debug(
"BROWSER %i: deleting temp dir %s"
% (self.browser_id, self.browser_params.tmpdir)
)
shutil.rmtree(self.browser_params.tmpdir)
self.logger.debug(
"BROWSER %i: completed deleting temp dir %s in %d seconds"
% (
self.browser_id,
self.browser_params.tmpdir,
time.time() - t1,
)
)
self.browser_params.tmpdir = None
except Exception as e:
self.logger.warn(
"BROWSER %i: failed to delete temp dir %s"
% (
self.browser_id,
self.browser_params.tmpdir,
),
exc_info=True,
)

def execute_command_sequence(
self,
# Quoting to break cyclic import, see https://stackoverflow.com/a/39757388
Expand Down
13 changes: 13 additions & 0 deletions openwpm/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ class BrowserParams(DataClassJsonMixin):
default=Path(tempfile.gettempdir()),
metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path),
)

"""
The tmp_profile_dir defaults to the OS's temporary file folder (typically /tmp) and is where the generated
browser profiles and residual files are stored.
Expand Down Expand Up @@ -139,6 +140,18 @@ class BrowserParams(DataClassJsonMixin):
"""

tmpdir: Optional[Path] = field(
default=None,
metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path),
)
"""
The temporary directory used by `geckodriver`. It is created in
`launch_browser_manager` and deleted again in `close_browser_manager`. We do
this because it seems that `geckodriver` doesn't clean up its temporary
files (in particular, a copy of the extension XPI file), so we need to do
so ourselves.
"""

recovery_tar: Optional[Path] = None
donottrack: bool = False
tracking_protection: bool = False
Expand Down
9 changes: 9 additions & 0 deletions openwpm/deploy_browsers/deploy_firefox.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,14 @@ def deploy_firefox(
# Launch the webdriver
status_queue.put(("STATUS", "Launch Attempted", None))

# Use browser_params.tmpdir as the temporary directory. This is so that
# geckodriver makes its copy of the extension XPI file in tmpdir, so
# we can delete it later and not have it left behind. We set TMPDIR on a
# copy of `os.environ` rather than on `os.environ` itself, so that the
# OpenWPM process's own environment is left unmodified.
env = os.environ.copy()
env["TMPDIR"] = str(browser_params.tmpdir)

fo.binary_location = firefox_binary_path
geckodriver_path = subprocess.check_output(
"which geckodriver", encoding="utf-8", shell=True
Expand All @@ -149,6 +157,7 @@ def deploy_firefox(
service=Service(
executable_path=geckodriver_path,
log_output=open(webdriver_interceptor.fifo, "w"),
env=env,
),
)

Expand Down

0 comments on commit 58181c6

Please sign in to comment.