From 58181c6b4a72365c3ff1d8b31434abed211e92a0 Mon Sep 17 00:00:00 2001 From: Norman Danner Date: Mon, 10 Jun 2024 15:56:16 -0400 Subject: [PATCH] feat(browser_manager): enable full XPI cleanup Set TMPDIR in the environment to a unique directory for each browser instance and delete it when the browser quits. This is a workaround for an issue with geckodriver. When the OpenWPM extension is installed via `WebDriver.install_addon()`, geckodriver makes a copy of the XPI file in TMPDIR. However, geckodriver never deletes that file. So on a stateless crawl, you end up with one copy of the XPI file for each site visited. This workaround sets TMPDIR in the environment before creating the geckodriver service, and then deletes the directory in `close_browser_manager()` once the browser manager has shut down or been killed. We use this indirection because we don't have access to the name of the temporary file, and it doesn't seem safe to just delete XPI files in /tmp. --- openwpm/browser_manager.py | 55 +++++++++++++++++++++++ openwpm/config.py | 13 ++++++ openwpm/deploy_browsers/deploy_firefox.py | 9 ++++ 3 files changed, 77 insertions(+) diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index 2d929a611..f7c81b7dc 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -131,6 +131,33 @@ def launch_browser_manager(self) -> bool: crash_recovery = True + # Create a unique temporary directory that we can delete + # when we shut down. Note that this doesn't force anything to + # use `tmpdir`, it just makes it available. + if self.browser_params.tmpdir is not None: + self.logger.debug( + "BROWSER %i: leftover temp directory %s? Deleting it." 
+ % (self.browser_id, self.browser_params.tmpdir) + ) + try: + shutil.rmtree(self.browser_params.tmpdir) + except Exception: + self.logger.debug( + "BROWSER %i: error deleting %s" + % ( + self.browser_id, + self.browser_params.tmpdir, + ), + exc_info=True, + ) + self.browser_params.tmpdir = Path( + tempfile.mkdtemp(prefix="openwpm_", dir=os.getenv("TMPDIR", default="/tmp")) + ) + self.logger.debug( + "BROWSER %i: Using temp dir %s" + % (self.browser_id, self.browser_params.tmpdir) + ) + self.logger.info("BROWSER %i: Launching browser..." % self.browser_id) self.is_fresh = not crash_recovery @@ -340,6 +367,34 @@ def close_browser_manager(self, force: bool = False) -> None: if not shutdown_complete: self.kill_browser_manager() + # Delete the temporary directory used by geckodriver. + if self.browser_params.tmpdir is not None: + try: + t1 = time.time() + self.logger.debug( + "BROWSER %i: deleting temp dir %s" + % (self.browser_id, self.browser_params.tmpdir) + ) + shutil.rmtree(self.browser_params.tmpdir) + self.logger.debug( + "BROWSER %i: completed deleting temp dir %s in %d seconds" + % ( + self.browser_id, + self.browser_params.tmpdir, + time.time() - t1, + ) + ) + self.browser_params.tmpdir = None + except Exception as e: + self.logger.warn( + "BROWSER %i: failed to delete temp dir %s" + % ( + self.browser_id, + self.browser_params.tmpdir, + ), + exc_info=True, + ) + def execute_command_sequence( self, # Quoting to break cyclic import, see https://stackoverflow.com/a/39757388 diff --git a/openwpm/config.py b/openwpm/config.py index 674d8dab6..e42824550 100644 --- a/openwpm/config.py +++ b/openwpm/config.py @@ -103,6 +103,7 @@ class BrowserParams(DataClassJsonMixin): default=Path(tempfile.gettempdir()), metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path), ) + """ The tmp_profile_dir defaults to the OS's temporary file folder (typically /tmp) and is where the generated browser profiles and residual files are stored. 
@@ -139,6 +140,18 @@ class BrowserParams(DataClassJsonMixin): """ + tmpdir: Optional[Path] = field( + default=None, + metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path), + ) + """ + The temporary directory used by `geckodriver`. This is configured in + `BrowserManager.run` and then deleted when the browser is finished. We do + this because it seems that `geckodriver` doesn't clean up its temporary + files (in particular, a copy of the extension XPI file), so we need to do + so ourselves. + """ + recovery_tar: Optional[Path] = None donottrack: bool = False tracking_protection: bool = False diff --git a/openwpm/deploy_browsers/deploy_firefox.py b/openwpm/deploy_browsers/deploy_firefox.py index c4797b794..b5c4a6e29 100755 --- a/openwpm/deploy_browsers/deploy_firefox.py +++ b/openwpm/deploy_browsers/deploy_firefox.py @@ -140,6 +140,14 @@ def deploy_firefox( # Launch the webdriver status_queue.put(("STATUS", "Launch Attempted", None)) + # Use browser_params.tmpdir as the temporary directory. This is so that + # geckodriver makes its copy of the extension XPI file in tmpdir, so + # we can delete it later and not have it left behind. I make a shallow + # copy of `os.environ` because I'm a little nervous about modifying the + # OpenWPM process' environment. + env = os.environ.copy() + env["TMPDIR"] = str(browser_params.tmpdir) + fo.binary_location = firefox_binary_path geckodriver_path = subprocess.check_output( "which geckodriver", encoding="utf-8", shell=True @@ -149,6 +157,7 @@ def deploy_firefox( service=Service( executable_path=geckodriver_path, log_output=open(webdriver_interceptor.fifo, "w"), + env=env, ), )