From 993dd3b22c971d8a696a0f4880651b5742b96600 Mon Sep 17 00:00:00 2001 From: Philipp Date: Thu, 11 Jan 2024 09:22:46 +0100 Subject: [PATCH] Feat/results by pairing (#33) * [clemgame] store results grouped by model pairing (instead of game): results->pair->game * [eval] swap order of game and model in results structure --------- Co-authored-by: briemadu (cherry picked from commit 29eae1291f42fce9085ac5f172fbba42223715e3) --- clemgame/clemgame.py | 148 ++++++++++++++++++++++++---------------- clemgame/file_utils.py | 39 ++++++++--- evaluation/evalutils.py | 2 +- scripts/cli.py | 4 +- 4 files changed, 119 insertions(+), 74 deletions(-) diff --git a/clemgame/clemgame.py b/clemgame/clemgame.py index 0cd78a8c38..e53436fb8c 100644 --- a/clemgame/clemgame.py +++ b/clemgame/clemgame.py @@ -122,14 +122,21 @@ def load_template(self, file_name: str) -> str: """ return file_utils.load_template(file_name, self.name) - def load_json(self, file_name: str, is_results_file: bool = False) -> Dict: + def load_json(self, file_name: str) -> Dict: """ Load a .json file from your game (or game results) directory :param file_name: can have subdirectories e.g. "sub/my_file" - :param is_results_file: if to look into results directory (instead of games) :return: the file contents """ - return file_utils.load_json(file_name, self.name, is_results_file) + return file_utils.load_json(file_name, self.name) + + def load_results_json(self, file_name: str, dialogue_pair: str) -> Dict: + """ + Load a .json file from your game (or game results) directory + :param file_name: can have subdirectories e.g. "sub/my_file" + :return: the file contents + """ + return file_utils.load_results_json(file_name, dialogue_pair, self.name) def load_csv(self, file_name: str) -> Dict: """ @@ -159,7 +166,7 @@ def store_file(self, data, file_name: str, sub_dir: str = None): fp = file_utils.store_game_file(data, file_name, self.name, sub_dir=sub_dir) self.logger.info("Game file stored to %s", fp) - def store_results_file(self, data, file_name: str, sub_dir: str = None): + def store_results_file(self, data, file_name: str, dialogue_pair: str, sub_dir: str = None): """ Store a results file in your game results' directory. The top-level directory is 'results'. @@ -167,11 +174,11 @@ def store_results_file(self, data, file_name: str, sub_dir: str = None): :param data: to store :param file_name: can have subdirectories e.g. "sub/my_file" """ - fp = file_utils.store_game_results_file(data, file_name, self.name, sub_dir=sub_dir) + fp = file_utils.store_game_results_file(data, file_name, dialogue_pair, self.name, sub_dir=sub_dir) self.logger.info("Results file stored to %s", fp) - def results_path(self): - return file_utils.results_dir(self.name) + def results_path_for(self, dialogue_pair: str): + return file_utils.game_results_dir_for(dialogue_pair, self.name) def applies_to(self, game_name: str) -> bool: return game_name == self.name @@ -195,8 +202,8 @@ def __init__(self, name: str): "episode scores": {}, } - def store_scores(self, game_record_dir): - self.store_results_file(self.scores, "scores.json", sub_dir=game_record_dir) + def store_scores(self, dialogue_pair, game_record_dir): + self.store_results_file(self.scores, "scores.json", dialogue_pair, sub_dir=game_record_dir) def log_next_turn(self): """ Call this method to group interactions per turn """ @@ -266,7 +273,7 @@ def log_episode_score(self, score_name, score_value): self.scores["episode scores"][score_name] = score_value self.logger.info(f"{self.name}: Logged episode score {score_name}={score_value}.") - def store_records(self, game_id, game_record_dir): + def store_records(self, dialogue_pair_desc: str, game_id: int, game_record_dir: str): """Raise warnings if a mandatory element is empty or format is wrong.""" if not self.interactions["players"]: self.logger.warning(f"Players metadada is missing!") @@ -281,8 +288,12 @@ def store_records(self, game_id, game_record_dir): self.logger.warning(f"Interaction logs are missing!") if not self.requests: self.logger.warning(f"No calls logged!") - self.store_results_file(self.interactions, "interactions.json", sub_dir=game_record_dir) - self.store_results_file(self.requests, "requests.json", sub_dir=game_record_dir) + self.store_results_file(self.interactions, "interactions.json", + dialogue_pair_desc, + sub_dir=game_record_dir) + self.store_results_file(self.requests, "requests.json", + dialogue_pair_desc, + sub_dir=game_record_dir) class GameMaster(GameRecorder): @@ -541,44 +552,49 @@ def setup(self): self.instances = self.load_json("in/instances.json") def build_transcripts(self): - game_result_path = os.path.join(self.results_path(), "records") - if not os.path.exists(game_result_path) or not os.path.isdir(game_result_path): - stdout_logger.info("No results directory found at: " + game_result_path) - return - dialogue_partners = [file for file in os.listdir(game_result_path) - if os.path.isdir(os.path.join(game_result_path, file))] + results_root = file_utils.results_root() + dialogue_partners = [file for file in os.listdir(results_root) + if os.path.isdir(os.path.join(results_root, file))] for dialogue_pair in dialogue_partners: + game_result_path = self.results_path_for(dialogue_pair) + if not os.path.exists(game_result_path) or not os.path.isdir(game_result_path): + stdout_logger.info("No results directory found at: " + game_result_path) + continue + model_pair = string_utils.to_model_pair(dialogue_pair) model_pair = ["-".join(m.split("-")[:-1]) for m in model_pair] # remove -t0.0 - experiments_path = os.path.join(game_result_path, dialogue_pair) - experiment_dirs = [file for file in os.listdir(experiments_path) - if os.path.isdir(os.path.join(experiments_path, file))] + + experiment_dirs = [file for file in os.listdir(game_result_path) + if os.path.isdir(os.path.join(game_result_path, file))] if not experiment_dirs: stdout_logger.warning(f"{self.name}: No experiments for {dialogue_pair}") for experiment_dir in experiment_dirs: - experiment_path = os.path.join(experiments_path, experiment_dir) + experiment_path = os.path.join(game_result_path, experiment_dir) experiment_name = "_".join(experiment_dir.split("_")[1:]) # remove leading index number if self.filter_experiment and experiment_name not in self.filter_experiment: stdout_logger.info(f"Skip experiment {experiment_name}") continue stdout_logger.info(f"Transcribe: {experiment_name}") - rel_experiment_path = f"records/{dialogue_pair}/{experiment_dir}" - experiment_config = self.load_json(f"{rel_experiment_path}/experiment_{experiment_name}", - is_results_file=True) + experiment_config = self.load_results_json(f"{experiment_dir}/experiment_{experiment_name}", + dialogue_pair) episode_dirs = [file for file in os.listdir(experiment_path) if os.path.isdir(os.path.join(experiment_path, file))] error_count = 0 for episode_dir in tqdm(episode_dirs, desc="Building transcripts"): try: - rel_episode_path = f"{rel_experiment_path}/{episode_dir}" - game_instance = self.load_json(f"{rel_episode_path}/instance", is_results_file=True) - game_interactions = self.load_json(f"{rel_episode_path}/interactions", is_results_file=True) + rel_episode_path = f"{experiment_dir}/{episode_dir}" + game_instance = self.load_results_json(f"{rel_episode_path}/instance", dialogue_pair) + game_interactions = self.load_results_json(f"{rel_episode_path}/interactions", dialogue_pair) transcript = transcript_utils.build_transcript(game_interactions, experiment_config, game_instance, dialogue_pair) - self.store_results_file(transcript, "transcript.html", sub_dir=rel_episode_path) + self.store_results_file(transcript, "transcript.html", + dialogue_pair, + sub_dir=rel_episode_path) transcript_tex = transcript_utils.build_tex(game_interactions) - self.store_results_file(transcript_tex, "transcript.tex", sub_dir=rel_episode_path) + self.store_results_file(transcript_tex, "transcript.tex", + dialogue_pair, + sub_dir=rel_episode_path) except Exception: # continue with other episodes if something goes wrong self.logger.exception(f"{self.name}: Cannot transcribe {episode_dir} (but continue)") error_count += 1 @@ -587,43 +603,46 @@ def build_transcripts(self): f"{self.name}: '{error_count}' exceptions occurred: See clembench.log for details.") def compute_scores(self): - game_result_path = os.path.join(self.results_path(), "records") - if not os.path.exists(game_result_path) or not os.path.isdir(game_result_path): - stdout_logger.info("No results directory found at: " + game_result_path) - return - dialogue_partners = [file for file in os.listdir(game_result_path) - if os.path.isdir(os.path.join(game_result_path, file))] + results_root = file_utils.results_root() + dialogue_partners = [file for file in os.listdir(results_root) + if os.path.isdir(os.path.join(results_root, file))] for dialogue_pair in dialogue_partners: + game_result_path = self.results_path_for(dialogue_pair) + if not os.path.exists(game_result_path) or not os.path.isdir(game_result_path): + stdout_logger.info("No results directory found at: " + game_result_path) + continue + model_pair = string_utils.to_model_pair(dialogue_pair) model_pair = ["-".join(m.split("-")[:-1]) for m in model_pair] # remove -t0.0 - experiments_path = os.path.join(game_result_path, dialogue_pair) - experiment_dirs = [file for file in os.listdir(experiments_path) - if os.path.isdir(os.path.join(experiments_path, file))] + + experiment_dirs = [file for file in os.listdir(game_result_path) + if os.path.isdir(os.path.join(game_result_path, file))] if not experiment_dirs: stdout_logger.warning(f"{self.name}: No experiments for {dialogue_pair}") for experiment_dir in experiment_dirs: - experiment_path = os.path.join(experiments_path, experiment_dir) + experiment_path = os.path.join(game_result_path, experiment_dir) experiment_name = "_".join(experiment_dir.split("_")[1:]) # remove leading index number if self.filter_experiment and experiment_name not in self.filter_experiment: stdout_logger.info(f"Skip experiment {experiment_name}") continue stdout_logger.info(f"Scoring: {experiment_name}") - rel_experiment_path = f"records/{dialogue_pair}/{experiment_dir}" - experiment_config = self.load_json(f"{rel_experiment_path}/experiment_{experiment_name}", - is_results_file=True) + experiment_config = self.load_results_json(f"{experiment_dir}/experiment_{experiment_name}", + dialogue_pair) episode_dirs = [file for file in os.listdir(experiment_path) if os.path.isdir(os.path.join(experiment_path, file))] error_count = 0 for episode_dir in tqdm(episode_dirs, desc="Scoring episodes"): try: - rel_episode_path = f"{rel_experiment_path}/{episode_dir}" - game_instance = self.load_json(f"{rel_episode_path}/instance", is_results_file=True) - game_interactions = self.load_json(f"{rel_episode_path}/interactions", is_results_file=True) + rel_episode_path = f"{experiment_dir}/{episode_dir}" + game_instance = self.load_results_json(f"{rel_episode_path}/instance", + dialogue_pair) + game_interactions = self.load_results_json(f"{rel_episode_path}/interactions", + dialogue_pair) game_master = self.create_game_master(experiment_config, model_pair) game_master.setup(**game_instance) game_master.compute_scores(game_interactions) - game_master.store_scores(rel_episode_path) + game_master.store_scores(dialogue_pair, rel_episode_path) except Exception: # continue with other episodes if something goes wrong self.logger.exception(f"{self.name}: Cannot score {episode_dir} (but continue)") error_count += 1 @@ -647,13 +666,15 @@ def run(self, dialog_pair: str, temperature: float): } ] - The instances will be automatically stored in "records" with the following structure: - - records - - experiment_name - - experiment.json - - episode_id - - instance.json - - interaction.json + The instances will be automatically stored in "game-name" with the following structure: + - results + - pairing + - game-name + - experiment_name + - experiment.json + - episode_id + - instance.json + - interaction.json """ self.logger.warning(f"{self.name}: Detected 'temperature={temperature}'") # Setting this directly on the apis for now (not on the players) @@ -710,14 +731,16 @@ def run(self, dialog_pair: str, temperature: float): self.logger.info("Activity: %s Experiment: %s Partners: %s Episode: %d", self.name, experiment_name, dialogue_pair_desc, episode_counter) - experiment_record_dir = f"records/{dialogue_pair_desc}/{experiment_idx}_{experiment_name}" + experiment_record_dir = f"{experiment_idx}_{experiment_name}" experiment_config = {k: experiment[k] for k in experiment if k != 'game_instances'} # Add some important infos to track experiment_config["timestamp"] = datetime.now().isoformat() experiment_config["dialogue_partners"] = dialogue_pair - self.store_results_file(experiment_config, f"experiment_{experiment_name}.json", + self.store_results_file(experiment_config, + f"experiment_{experiment_name}.json", + dialogue_pair_desc, sub_dir=experiment_record_dir) error_count = 0 @@ -727,13 +750,16 @@ def run(self, dialog_pair: str, temperature: float): game_id = game_instance["game_id"] self.logger.info("Activity: %s Experiment: %s Episode: %d Game: %s", self.name, experiment_name, episode_counter, game_id) - game_record_dir = experiment_record_dir + f"/episode_{episode_counter}" - self.store_results_file(game_instance, f"instance.json", sub_dir=game_record_dir) + episode_dir = experiment_record_dir + f"/episode_{episode_counter}" + self.store_results_file(game_instance, + f"instance.json", + dialogue_pair_desc, + sub_dir=episode_dir) try: game_master = self.create_game_master(experiment_config, dialogue_pair) game_master.setup(**game_instance) game_master.play() - game_master.store_records(game_id, game_record_dir) + game_master.store_records(dialogue_pair_desc, game_id, episode_dir) except Exception: # continue with other episodes if something goes wrong self.logger.exception(f"{self.name}: Exception for episode {game_id} (but continue)") error_count += 1 @@ -744,7 +770,9 @@ def run(self, dialog_pair: str, temperature: float): # Add experiment duration and overwrite file time_experiment_end = datetime.now() - time_experiment_start experiment_config["duration"] = str(time_experiment_end) - self.store_results_file(experiment_config, f"experiment_{experiment_name}.json", + self.store_results_file(experiment_config, + f"experiment_{experiment_name}.json", + dialogue_pair_desc, sub_dir=experiment_record_dir) def is_single_player(self) -> bool: diff --git a/clemgame/file_utils.py b/clemgame/file_utils.py index f6523d812e..a517133f78 100644 --- a/clemgame/file_utils.py +++ b/clemgame/file_utils.py @@ -12,12 +12,16 @@ def game_dir(game_name: str) -> str: return os.path.join(project_root(), "games", game_name) -def results_dir(game_name: str) -> str: - return os.path.join(project_root(), "results", game_name) +def results_root() -> str: + return os.path.join(project_root(), "results") -def load_json(file_name: str, game_name: str, is_results_file=False) -> Dict: - data = load_file(file_name, game_name, file_ending=".json", is_results_file=is_results_file) +def game_results_dir_for(dialogue_pair: str, game_name: str) -> str: + return os.path.join(results_root(), dialogue_pair, game_name) + + +def load_json(file_name: str, game_name: str) -> Dict: + data = load_file(file_name, game_name, file_ending=".json") data = json.loads(data) return data @@ -38,26 +42,39 @@ def load_template(file_name: str, game_name: str) -> str: return load_file(file_name, game_name, file_ending=".template") -def file_path(file_name: str, game_name: str = None, is_results_file=False) -> str: - if is_results_file: - return os.path.join(results_dir(game_name), file_name) +def file_path(file_name: str, game_name: str = None) -> str: if game_name: return os.path.join(game_dir(game_name), file_name) return os.path.join(project_root(), file_name) -def load_file(file_name: str, game_name: str = None, file_ending: str = None, is_results_file=False) -> str: +def load_file(file_name: str, game_name: str = None, file_ending: str = None) -> str: + if file_ending and not file_name.endswith(file_ending): + file_name = file_name + file_ending + fp = file_path(file_name, game_name) + with open(fp, encoding='utf8') as f: + data = f.read() + return data + + +def load_results_json(file_name: str, dialogue_pair: str, game_name: str) -> Dict: + data = load_results_file(file_name, dialogue_pair, game_name, file_ending=".json") + data = json.loads(data) + return data + + +def load_results_file(file_name: str, dialogue_pair: str, game_name: str, file_ending: str = None) -> str: if file_ending and not file_name.endswith(file_ending): file_name = file_name + file_ending - fp = file_path(file_name, game_name, is_results_file) + fp = os.path.join(game_results_dir_for(dialogue_pair, game_name), file_name) with open(fp, encoding='utf8') as f: data = f.read() return data -def store_game_results_file(data, file_name: str, game_name: str, sub_dir: str = None, +def store_game_results_file(data, file_name: str, dialogue_pair: str, game_name: str, sub_dir: str = None, do_overwrite: bool = True) -> str: - return store_file(data, file_name, results_dir(game_name), sub_dir, do_overwrite) + return store_file(data, file_name, game_results_dir_for(dialogue_pair, game_name), sub_dir, do_overwrite) def store_game_file(data, file_name: str, game_name: str, sub_dir: str = None, do_overwrite: bool = True) -> str: diff --git a/evaluation/evalutils.py b/evaluation/evalutils.py index 299469ee59..dcc43f038f 100644 --- a/evaluation/evalutils.py +++ b/evaluation/evalutils.py @@ -112,7 +112,7 @@ def parse_directory_name(name: str) -> dict: """Extract information from the directory name structure.""" splits = str(name).split('/') - game, _, model, experiment, episode, _ = splits[-6], splits[-5], splits[-4], splits[-3], splits[-2], splits[-1] + model, game, experiment, episode, _ = splits[-5:] return {'game': game, 'model': model, 'experiment': experiment, diff --git a/scripts/cli.py b/scripts/cli.py index b3b362db79..299f214d73 100644 --- a/scripts/cli.py +++ b/scripts/cli.py @@ -15,10 +15,10 @@ $> python3 scripts/cli.py [-m "mock"] run privateshared To score all games: - $> python3 scripts/cli.py [-m "mock"] score all + $> python3 scripts/cli.py score all To score a specific game: - $> python3 scripts/cli.py [-m "mock"] score privateshared + $> python3 scripts/cli.py score privateshared To score all games: $> python3 scripts/cli.py transcribe all