# evaluate_story.py

# standard-library and third-party imports
from transformers import AutoProcessor, AutoModel, ViTModel
import os
import torch
import hpsv2
from openai import OpenAI
from torchvision import transforms
from facenet_pytorch import MTCNN, InceptionResnetV1
from sklearn.metrics.pairwise import cosine_similarity
import json
from dotenv import load_dotenv, find_dotenv
import re
from PIL import Image
import shutil
import random
import math
import sys
import numpy as np
from torch.nn import functional as F

# Put the current working directory and its `tifa` sub-directory at the front
# of the import path. The `tifa.*` imports below are placed after these two
# lines, so their order must be kept.
sys.path.insert(0, os.getcwd())
sys.path.insert(0, os.path.join(os.getcwd(), 'tifa'))
from tifa.tifascore.question_gen import get_question_and_answers
from tifa.tifascore.unifiedqa import UnifiedQAModel
from tifa.tifascore.question_filter import filter_question_and_answers
from tifa.tifascore.tifa_score import tifa_score_single
from tifa.tifascore import VQAModel

# local import
from utilities import query_gpt, encode_image

# Read the .env file located by find_dotenv() at import time; the return value
# of load_dotenv() is deliberately discarded.
# NOTE(review): presumably this is what provides OPENAI_API_KEY, which
# TIFA_metric_score reads from os.environ - confirm.
_ = load_dotenv(find_dotenv())


def _parse_llm_response(response):
    """
    Extract the dictionary contained in a raw LLM reply.

    The payload is taken from the first ```python / ```json fenced block or,
    failing that, from the outermost ``{...}`` span of the reply.

    SECURITY: the reply is untrusted model output, so it is parsed with
    ``ast.literal_eval`` (falling back to ``json.loads``) instead of ``eval``.
    As a consequence, arithmetic expressions inside the reply are no longer
    evaluated; such a reply is reported and kept as raw text.

    :param response: raw reply returned by the LLM
    :return: the parsed object, or ``response`` unchanged when no dictionary could be parsed
    """
    import ast  # function-scope import: not part of the module header

    if not isinstance(response, str):
        return response

    fenced_blocks = re.findall(r'```(?:json|python)\n(.*?)\n```', response, re.DOTALL)
    if fenced_blocks:
        candidate = fenced_blocks[0]
    elif '{' in response and '}' in response:
        candidate = response[response.index('{'):response.rindex('}') + 1]
    else:
        return response

    for parser in (ast.literal_eval, json.loads):
        try:
            return parser(candidate)
        except (ValueError, SyntaxError, TypeError):
            continue

    print('Warning: could not parse the LLM response; keeping the raw text.')
    return response


def _image_message(image_path):
    """Return the chat-message part that embeds the image at image_path as a base64 data URL."""
    base64_image = encode_image(image_path)
    return {"type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}


def _numeric_score(parsed_scores, key):
    """Return parsed_scores[key] as a float, or 0.0 when it is missing or not numeric."""
    if not isinstance(parsed_scores, dict):
        return 0.0
    try:
        return float(parsed_scores.get(key, 0))
    except (TypeError, ValueError):
        return 0.0


def _clear_directory(directory):
    """Delete every file and sub-directory inside directory, keeping the directory itself."""
    if not os.path.exists(directory):
        return
    for item in os.listdir(directory):
        item_path = os.path.join(directory, item)
        if os.path.isfile(item_path):
            os.remove(item_path)
        elif os.path.isdir(item_path):
            shutil.rmtree(item_path)


def GPT4_score(characters, story_figures=None):
    """
    Final overall GPT-4 evaluation for stories produced by ensemble and the ones produced by methods.

    For each LLM ('GPT_4', then 'Llama_3') every story in ``story_figures[LLM]`` is sent to
    gpt-4o twice: once for style consistency and once (with one real photo per character
    appended) for entity consistency. The parsed answers are stored on the story under
    'style_consistency_score' and 'entity_consistency_score', the stories are ranked by
    the sum of their 'overall_style_consistency' and 'average_entity_consistency_score'
    ('gpt_4_final_rank'), and the images of the three best stories are copied to
    ``3_best_<tag>_stories/<LLM>/<rank>_<story>``. Results are written to
    ``GPT_4_score_<tag>.json`` in the current working directory after each LLM's
    evaluation and again at the end.

    ``story_figures`` is updated in place.

    :param characters: List of characters (each exposing ``random_photo`` and ``unique_token``)
    :param story_figures: Dictionary keyed by LLM name, mapping story names to dictionaries
        that contain at least an 'image_path' list
    :return: None
    :raises ValueError: if ``story_figures`` is not provided
    """
    if story_figures is None:
        raise ValueError("story_figures is required: expected a dictionary of stories keyed by LLM name")

    tag = 'two_characters' if len(characters) == 2 else (
        'three_characters' if len(characters) == 3 else 'four_characters')
    GPT4_score_path = os.path.join(os.getcwd(), f"GPT_4_score_{tag}.json")

    key1 = 'overall_style_consistency'
    key2 = 'average_entity_consistency_score'

    def ranking_score(item):
        # item is a (story_key, story) pair; unparsable or missing scores count as 0
        story = item[1]
        return (_numeric_score(story.get('style_consistency_score'), key1)
                + _numeric_score(story.get('entity_consistency_score'), key2))

    # STYLE CONSISTENCY EVALUATION: LLM instructions (identical for every story)
    system_message_style_consistency = (
        "To complete this task, please follow these detailed steps:"
        "\n\nStep 1: As a skilled digital art creator renowned for your discerning eye, label each input image sequentially as image_<number> and identify the visual styles described in each user-provided prompt text."
        "\n\nStep 2: Carefully evaluate the images and assess their alignment to the visual styles described in the user-provided prompt texts, focusing on the following aspects:"
        "\n  - Media Type"
        "\n  - Color Palette"
        "\n  - Tint"
        "\n  - Ambience"
        "\n  - Saturation"
        "\n  - Contrast"
        "\n  - Overall Feel (e.g., painterly, digital, cartoony, realistic)"
        "\n\nStep 3: Present your findings in a table, with each aspect listed as a row. Include comments explaining your assessment for each aspect."
        "\n\nStep 4: Critically assign a score to each aspect, reflecting both the consistency across images and alignment with the prompt-described styles. Use a scale from 1 (worst) to 10 (best). Additionally, compute and include an average score for overall style consistency."
    )

    system_message_style_consistency += (
        "\n\nThe output should be formatted as a Python dictionary as shown below:"
        "\n```python"
        "\n{"
        "\n    'media_type_consistency': <score>,"
        "\n    'media_type_alignment': <score>,"
        "\n    'color_palette_alignment': <score>,"
        "\n    'color_palette_consistency': <score>,"
        "\n    'tint_consistency': <score>,"
        "\n    'tint_alignment': <score>,"
        "\n    'saturation_consistency': <score>,"
        "\n    'saturation_alignment': <score>,"
        "\n    'contrast_consistency': <score>,"
        "\n    'contrast_alignment': <score>,"
        "\n    'overall_feel_consistency': <score>,"
        "\n    'overall_feel_alignment': <score>,"
        "\n    'overall_style_consistency': <average score of all previous scores>"
        "\n}"
        "\n```"
        "\nPlease respond by providing only the formatted Python dictionary, with keys representing the evaluation aspects and values indicating the corresponding scores."
    )

    # ENTITY CONSISTENCY EVALUATION: LLM instructions (depend on the characters only)
    system_message_entity_consistency = (
        "To complete this task, please follow these detailed steps:"
        f"\n\nStep 1: As a skilled digital art creator, identify consistent subjects displayed in each user-provided prompt text."
        f"\n\nStep 2: Besides the last {len(characters)} user-provided images, label the remaining input images sequentially as image_<number>. Each image corresponds to one prompt from Step 1. These last {len(characters)} images represent real photos of characters: {' and '.join([char.unique_token for char in characters])} respectively. Determine if each character maintains consistent appearances (clothes, facial features) across all images featuring that character and if they match the character's real photo. Present your findings in a table, with each row representing a main character."
        "\n\nStep 3: Critically evaluate the consistency of characters' appearances and their alignment to prompts and real character images by assigning scores to each character, on a scale from 1 (worst) to 10 (best). Consider facial features and clothing consistencies. Evaluate all character instances in all generated images and critically assess their alignment with real character images and the prompts. Ignore images without a character when assigning scores."
    )

    system_message_entity_consistency += (
        "\n\nThe output should be formatted as a Python dictionary as shown below:"
        "\n```python"
        "\n{"
        "\n    \"character_represented\": <Critically evaluate whether each image is showing the characters referred to in the relevant prompt. Assign an average score for each character. Penalize images that depict only a subset of the referenced characters.>,"
        "\n    \"character_clothes_consistency\": <Critically assign a consistency score for EACH character's clothes and general appearance across all images unless it is requested by the corresponding prompt>,"
        "\n    \"character_clothes_alignment\": <Critically assign an alignment score for EACH character's clothes with what is described in the prompt>,"
        "\n    \"character_face_features_consistency\": <Critically assign a consistency score for EACH character's facial features. Ensure that all images which include characters display them with the same facial features.>,"
        "\n    \"character_face_alignment\": <Critically assign a score for each character evaluating whether images accurately depict characters mentioned in the relevant prompt as displayed in the character's real images.>,"
        "\n    \"character_entity_consistency\": <Assign a consistency score for EACH character's clothes and facial features across all images.>,"
        "\n    \"character_entity_alignment\": <Assign an alignment score for EACH character's clothes and facial features with the real character images and character representation in the prompts.>,"
        "\n    \"average_entity_consistency_score\": <Average score of all characters' consistency and alignment scores>,"
        "\n}"
        "\n```"
        "\nPlease respond by providing only the formatted Python dictionary, with keys representing the evaluation aspects and values indicating the corresponding scores. Do not add any details before or after."
    )

    # give a GPT 4 score for prompts generated by GPT_4 and LLaMa 3 in turns
    for LLM in ('GPT_4', 'Llama_3'):
        # Load data generated by the LLM
        print(f'\n# Start final evaluation for {LLM} story #')
        JSON_name = f'prompt_{LLM}_{tag}_filled.json'
        json_file_path = os.path.join(os.getcwd(), JSON_name)
        with open(json_file_path, "r") as file:
            data = json.load(file)

        # The prompt listing is the same for every story of this LLM
        prompts_text_collector = ' and '.join(
            [f'prompt {prompt_dict["prompt_number"]}, {prompt_dict["prompt_text"]}. ' for prompt_dict in data]
        )

        user_message_Style_consistency = (
            "Perform an style consistency on the input images, assuming they collectively present a coherent visual narrative. "
            f"The narrative is segmented into prompts, with each prompt corresponding to an individual image. The prompts provided are: {prompts_text_collector}"
        )

        user_message_entity_consistency = (
            "Perform an Entity Consistency Evaluation on the input images, assuming they collectively present a coherent visual narrative. "
            f"The narrative is segmented into prompts, with each prompt corresponding to an individual image. The prompts provided are: {prompts_text_collector}"
        )

        for story_key, story_value in story_figures[LLM].items():
            image_url_list = [_image_message(image_path) for image_path in story_value['image_path']]

            # STYLE CONSISTENCY EVALUATION: query to GPT-4
            style_consistency_score = query_gpt(
                messages=(system_message_style_consistency, user_message_Style_consistency),
                model="gpt-4o",
                temperature=0.2, top_p=0.1, max_tokens=4096, image_url_list=image_url_list)

            style_consistency_score = _parse_llm_response(style_consistency_score)
            story_value['style_consistency_score'] = style_consistency_score
            print(f"{story_key}'s style consistency score", style_consistency_score)

            # ENTITY CONSISTENCY EVALUATION
            # integrate the real images of characters; they must come last, as the
            # system message refers to "the last N user-provided images"
            for char in characters:
                image_url_list.append(_image_message(char.random_photo))

            entity_consistency_score = query_gpt(
                (system_message_entity_consistency, user_message_entity_consistency),
                model="gpt-4o",  # "gpt-4-1106-vision-preview",
                temperature=0.2, top_p=0.1, max_tokens=4096, image_url_list=image_url_list)

            entity_consistency_score = _parse_llm_response(entity_consistency_score)
            story_value['entity_consistency_score'] = entity_consistency_score
            print(f"{story_key}'s entity consistency score", entity_consistency_score)
            print()

        # INTERMEDIATE RESULT: persist the scores gathered so far, so the GPT queries
        # are not lost if a later step fails
        with open(GPT4_score_path, "w") as file:
            json.dump(story_figures, file, indent=5)

        # Rank the stories of this LLM, best first
        ranked_stories = sorted(story_figures[LLM].items(), key=ranking_score, reverse=True)

        # delete previously generated results (if any)
        best_stories_dir = os.path.join(os.getcwd(), f'3_best_{tag}_stories', f'{LLM}')
        _clear_directory(best_stories_dir)

        for rank, (story_key, story) in enumerate(ranked_stories, start=1):
            story['gpt_4_final_rank'] = rank
            if rank > 3:
                continue

            # copy the images of the three best stories
            destination_dir = os.path.join(best_stories_dir, f'{rank}_{story_key}')
            os.makedirs(destination_dir, exist_ok=True)
            for image in story['image_path']:
                shutil.copy(image, os.path.join(destination_dir, os.path.basename(image)))

    # Write the final result (now including the ranks) to the JSON file
    with open(GPT4_score_path, "w") as file:
        json.dump(story_figures, file, indent=5)


def PickScore_metric(characters):
    """
    Image-to-text alignment metric: for every prompt, score all images generated for it
    with PickScore and store the softmax probability of each image under
    ``scores[method]['PickScore']``.

    The prompt files ``prompt_<LLM>_<tag>_filled.json`` (for 'GPT_4', then 'Llama_3') are
    read from, and rewritten in, the directory containing this file; the working
    directory is changed to that directory as a side effect.

    Prompts for which no method produced an image are left untouched.

    :param characters: list of characters; only its length is used, to select the prompt files
    :return: None
    """
    os.chdir(os.path.abspath(os.path.dirname(__file__)))

    # Fall back to the CPU when no CUDA device is available instead of failing
    device = "cuda" if torch.cuda.is_available() else "cpu"
    processor_name_or_path = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
    model_pretrained_name_or_path = "yuvalkirstain/PickScore_v1"

    processor = AutoProcessor.from_pretrained(processor_name_or_path)
    model = AutoModel.from_pretrained(model_pretrained_name_or_path).eval().to(device)

    tag = 'two_characters' if len(characters) == 2 else (
        'three_characters' if len(characters) == 3 else 'four_characters')

    for LLM in ('GPT_4', 'Llama_3'):
        JSON_name = f'prompt_{LLM}_{tag}_filled.json'
        json_file_path = os.path.join(os.getcwd(), JSON_name)
        with open(json_file_path, "r") as file:
            data = json.load(file)

        for prompt_dict in data:
            # method -> image path, restricted to the methods that produced an image
            generated = {method: path
                         for method, path in prompt_dict['generated_photo_path'].items()
                         if path is not None}
            if not generated:
                continue

            images = [Image.open(path) for path in generated.values()]
            try:
                image_inputs = processor(
                    images=images,
                    padding=True,
                    truncation=True,
                    max_length=77,
                    return_tensors="pt",
                ).to(device)
            finally:
                # The pixel data now lives in tensors; release the file handles
                for image in images:
                    image.close()

            text_inputs = processor(
                text=prompt_dict['prompt_text'],
                padding=True,
                truncation=True,
                max_length=77,
                return_tensors="pt",
            ).to(device)

            with torch.no_grad():
                # embed and L2-normalise both modalities
                image_embs = model.get_image_features(**image_inputs)
                image_embs = image_embs / torch.norm(image_embs, dim=-1, keepdim=True)

                text_embs = model.get_text_features(**text_inputs)
                text_embs = text_embs / torch.norm(text_embs, dim=-1, keepdim=True)

                # score
                scores = model.logit_scale.exp() * (text_embs @ image_embs.T)[0]

                # probabilities over the images competing for this prompt
                pick_score = torch.softmax(scores, dim=-1)

            for method, pickscore in zip(generated, pick_score.cpu().tolist()):
                prompt_dict['scores'][method]['PickScore'] = pickscore

        # Write the updated prompts back to the JSON file
        with open(json_file_path, "w") as file:
            json.dump(data, file, indent=4)

    del processor, model


def human_preference_score_metric(characters):
    """
    Image-to-text alignment metric: for every prompt, score all images generated for it
    with HPS v2.1 and store the min-max normalised score of each image under
    ``scores[method]['HPS']``.

    The prompt files ``prompt_<LLM>_<tag>_filled.json`` (for 'GPT_4', then 'Llama_3') are
    read from, and rewritten in, the current working directory.

    Prompts for which no method produced an image are left untouched. When all images
    of a prompt receive the same score (including the single-image case) the
    normalisation is undefined and every image is given 0.0.

    :param characters: list of characters; only its length is used, to select the prompt files
    :return: None
    """
    tag = 'two_characters' if len(characters) == 2 else (
        'three_characters' if len(characters) == 3 else 'four_characters')

    for LLM in ('GPT_4', 'Llama_3'):
        JSON_name = f'prompt_{LLM}_{tag}_filled.json'
        json_file_path = os.path.join(os.getcwd(), JSON_name)
        with open(json_file_path, "r") as file:
            data = json.load(file)

        for prompt_dict in data:
            # method -> image path, restricted to the methods that produced an image
            generated = {method: path
                         for method, path in prompt_dict['generated_photo_path'].items()
                         if path is not None}
            if not generated:
                continue

            images = [Image.open(path) for path in generated.values()]
            try:
                # calculating the human preference score
                human_preference = hpsv2.score(images, prompt_dict['prompt_text'], hps_version="v2.1")
            finally:
                for image in images:
                    image.close()

            # normalize human preference scores before adding them
            min_hps = min(human_preference)
            max_hps = max(human_preference)
            span = max_hps - min_hps

            for method, hps in zip(generated, human_preference):
                # span == 0 would divide by zero (or yield NaN with numpy floats)
                normalized = (hps - min_hps) / span if span > 0 else 0.0
                prompt_dict['scores'][method]['HPS'] = float(normalized)

        # Write the updated prompts back to the JSON file
        with open(json_file_path, "w") as file:
            json.dump(data, file, indent=4)


def TIFA_metric_score(characters):
    """
    TIFA: text-to-image faithfulness evaluation with question answering.

    For each prompt, questions are generated from the prompt text and filtered with
    the UnifiedQA model; every generated image is then scored with the VQA model and
    the result is stored under ``scores[method]['TIFA_metric_score']``. The prompt
    files of both LLMs are rewritten in place. An image whose scoring fails is
    reported on stdout and skipped.

    args:
    characters: list of characters
    """
    script_dir = os.path.abspath(os.path.dirname(__file__))
    os.chdir(script_dir)

    OpenAI.api_key = os.environ['OPENAI_API_KEY']

    unifiedqa_model = UnifiedQAModel("allenai/unifiedqa-v2-t5-large-1363200")
    vqa_model = VQAModel("git-base")

    for use_gpt_4 in (True, False):
        if use_gpt_4:
            LLM = 'GPT_4'
        else:
            LLM = 'Llama_3'

        if len(characters) == 2:
            tag = 'two_characters'
        elif len(characters) == 3:
            tag = 'three_characters'
        else:
            tag = 'four_characters'

        json_file_path = os.path.join(os.getcwd(), f'prompt_{LLM}_{tag}_filled.json')
        with open(json_file_path, "r") as file:
            data = json.load(file)

        # walk over the prompts and attach a TIFA score to each generated image
        scored_prompts = []
        for prompt_dict in data:
            scored_prompt = prompt_dict.copy()
            questions = get_question_and_answers(scored_prompt['prompt_text'])
            # keep only the questions accepted by the UnifiedQA filter
            questions = filter_question_and_answers(unifiedqa_model, questions)

            for method, image_path in prompt_dict['generated_photo_path'].items():
                if image_path is None:
                    continue
                try:
                    tifa_result = tifa_score_single(vqa_model, questions, image_path)
                    scored_prompt['scores'][method]['TIFA_metric_score'] = tifa_result['tifa_score']
                except Exception as e:
                    print(
                        f"Caught an exception: {e} with prompt number {prompt_dict['prompt_number']} for image generated by {method}")

            scored_prompts.append(scored_prompt)

        # serialise the updated prompts and overwrite the JSON file
        serialized = json.dumps(scored_prompts, indent=4)
        with open(json_file_path, "w") as file:
            file.write(serialized)


def _combination_paths(combination):
    """Return the image path (the single key) of every per-prompt entry of combination."""
    return [next(iter(entry)) for entry in combination]


def _combination_score(combination):
    """Return the sum of the 'average_image_score' of every per-prompt entry of combination."""
    return sum(next(iter(entry.values()))['average_image_score'] for entry in combination)


def _merge_best(best, scored, top_count):
    """
    Merge scored combinations into best, keeping the top_count distinct ones.

    :param best: dict mapping a tuple of image paths to (score, combination)
    :param scored: list of (score, combination) pairs sorted best first
    :param top_count: number of distinct combinations to keep
    :return: the trimmed dictionary
    """
    examined = set()
    for score, combination in scored:
        key = tuple(_combination_paths(combination))
        if key in examined:
            continue
        examined.add(key)
        best.setdefault(key, (score, combination))
        # scored is sorted, so later entries cannot enter the top once enough
        # distinct combinations have been seen
        if len(examined) >= top_count:
            break
    ranked = sorted(best.items(), key=lambda item: item[1][0], reverse=True)
    return dict(ranked[:top_count])


def _simulated_annealing_search(options_per_prompt, initial_temperature=100, cooling_rate=0.97,
                                max_iterations=10000, top_count=100):
    """
    Search for high-scoring combinations (one image per prompt) with simulated annealing.

    :param options_per_prompt: for each prompt, the list of candidate ``{path: scores}`` entries
    :return: up to top_count distinct visited combinations, best first
    """
    current = [random.choice(options) for options in options_per_prompt]
    current_score = _combination_score(current)
    best = {tuple(_combination_paths(current)): (current_score, current)}
    temperature = initial_temperature

    for _ in range(max_iterations):
        # Neighbour: re-draw the image of one randomly chosen prompt
        candidate = current[:]
        position = random.randrange(len(candidate))
        candidate[position] = random.choice(options_per_prompt[position])
        candidate_score = _combination_score(candidate)
        delta = candidate_score - current_score

        # Always accept improvements; accept a worse neighbour with a probability
        # that shrinks as the temperature cools
        if delta > 0 or (temperature > 0 and math.exp(delta / temperature) > random.random()):
            current, current_score = candidate, candidate_score
            key = tuple(_combination_paths(current))
            if key not in best:
                best[key] = (current_score, current)
                if len(best) > top_count:
                    # evict the lowest-scoring entry so better ones can still enter
                    del best[min(best, key=lambda k: best[k][0])]

        temperature *= cooling_rate

    ranked = sorted(best.values(), key=lambda item: item[0], reverse=True)
    return [combination for _, combination in ranked]


def _genetic_algorithm_search(options_per_prompt, population_size=10000, mutation_rate=0.15,
                              generations=1000, top_count=100):
    """
    Search for high-scoring combinations (one image per prompt) with a simple genetic algorithm.

    Each generation is sorted by fitness and the parents are drawn from its better half.

    :param options_per_prompt: for each prompt, the list of candidate ``{path: scores}`` entries
    :return: up to top_count distinct combinations seen in any generation, best first
    """
    population = [[random.choice(options) for options in options_per_prompt]
                  for _ in range(population_size)]
    best = {}

    for generation in range(generations + 1):
        # Sort on the fitness only: tied fitness values must never fall through
        # to comparing the dictionaries inside the individuals
        scored = sorted(((_combination_score(individual), individual) for individual in population),
                        key=lambda item: item[0], reverse=True)
        best = _merge_best(best, scored, top_count)
        if generation == generations:
            break

        parents = [individual for _, individual in scored[:max(1, population_size // 2)]]
        offspring = []
        while len(offspring) < population_size:
            mother, father = random.choices(parents, k=2)
            # A single-prompt story has no interior crossover point
            cut = random.randint(1, len(mother) - 1) if len(mother) > 1 else 0
            for child in (mother[:cut] + father[cut:], father[:cut] + mother[cut:]):
                for position, options in enumerate(options_per_prompt):
                    if random.random() < mutation_rate:
                        child[position] = random.choice(options)
                offspring.append(child)
        population = offspring

    ranked = sorted(best.values(), key=lambda item: item[0], reverse=True)
    return [combination for _, combination in ranked]


def compile_5_new_stories(characters, algorithm='simple_genetic_algorithm', exclude_metrics=None):
    """
    Compile various stories and sort them by the weighted sum average of their images score.
    Next, get 5 new good compilations and store them in the returned dictionary.

    For each LLM ('GPT_4', then 'Llama_3') the file ``prompt_<LLM>_<tag>_filled.json`` is read
    from the current working directory. Per prompt and metric, the absolute scores of the
    generated images are normalised to sum to one and multiplied by the metric weight
    ('PickScore' is only weighted); face metrics ('adaface_*', 'FaceNet_*') count as 0 for
    prompts involving more than one character. The average of these values is the
    'average_image_score' of an image, and a search algorithm looks for the combinations
    (one image per prompt) with the highest total.

    :param characters: list of characters
    :param algorithm: algorithm using for sorting, choices are: simple_genetic_algorithm/ simulated_annealing
    :param exclude_metrics: in case you decide to exclude metrics from being considered when compiling stories
    :return: dictionary keyed by LLM name; each value maps a method name or 'compilation_<n>'
        to its image paths, per-image scores and, when ranked, its 'compilation_rank'
    :raises ValueError: if algorithm is unknown or a prompt has no generated image
    """
    search_algorithms = {
        'simple_genetic_algorithm': _genetic_algorithm_search,
        'simulated_annealing': _simulated_annealing_search,
    }
    if algorithm not in search_algorithms:
        raise ValueError(
            f"Unknown algorithm '{algorithm}'; choices are: simple_genetic_algorithm, simulated_annealing")
    search = search_algorithms[algorithm]

    # None means "exclude nothing"
    excluded_metrics = () if exclude_metrics is None else exclude_metrics

    character_count = len(characters)
    metric_weighting = {
        'PickScore': 1.4,
        'HPS': 1,
        'TIFA_metric_score': 1,
        **{f'adaface_{i + 1}': 1 for i in range(character_count)},
        **{f'FaceNet_{i + 1}': 1 for i in range(character_count)},
        **{f'ViTS_16_DINO_embeddings_{i + 1}': 0 for i in range(character_count)},
        **{f'inception_v3_{i + 1}': 0 for i in range(character_count)},
    }
    face_metrics = ({f'adaface_{i + 1}' for i in range(character_count)}
                    | {f'FaceNet_{i + 1}' for i in range(character_count)})
    tag = 'two_characters' if character_count == 2 else (
        'three_characters' if character_count == 3 else 'four_characters')

    # Dictionary to store the stories of each LLM
    story_figures = {}

    for LLM in ('GPT_4', 'Llama_3'):
        # Retrieve data generated by each LLM
        JSON_name = f'prompt_{LLM}_{tag}_filled.json'
        json_file_path = os.path.join(os.getcwd(), JSON_name)
        with open(json_file_path, "r") as file:
            data = json.load(file)

        # Collect the paths to stories generated by each method
        story_method_collector = {method: [prompt['generated_photo_path'][method] for prompt in data]
                                  for method in data[0]['generated_photo_path'].keys()}

        prompt_include_one_character = [len(prompt['Characters_involved']) == 1 for prompt in data]
        prompt_include_two_character = [len(prompt['Characters_involved']) == 2 for prompt in data]
        prompt_include_no_character = [len(prompt['Characters_involved']) == 0 for prompt in data]

        llm_stories = story_figures.setdefault(LLM, {})

        score_metrics = list(data[0]['scores']['dreambooth'].keys())

        # weighted_scores[i][metric][method]: keyed by method so that methods without
        # an image for a prompt cannot shift the values of the other methods
        weighted_scores = []
        for prompt in data:
            multiple_characters = len(prompt['Characters_involved']) > 1
            per_metric = {}
            for score_metric in score_metrics:
                values = {}
                for method, path in prompt['generated_photo_path'].items():
                    if path and score_metric in prompt['scores'][method]:
                        if multiple_characters and score_metric in face_metrics:
                            values[method] = 0
                        else:
                            values[method] = abs(prompt['scores'][method][score_metric])

                # metrics without an explicit weight default to a weight of 1
                weight = metric_weighting.get(score_metric, 1)
                total = sum(values.values())
                if score_metric == 'PickScore':
                    values = {method: value * weight for method, value in values.items()}
                elif total != 0:  # prevent division by zero
                    values = {method: value / total * weight for method, value in values.items()}
                per_metric[score_metric] = values
            weighted_scores.append(per_metric)

        # for every prompt, the list of {image path: scores} candidates
        options_per_prompt = []
        for i, prompt in enumerate(data):
            options = []
            for method, path in prompt['generated_photo_path'].items():
                if not path:
                    continue
                scores = prompt['scores'][method]

                # exclude metrics as required, and use the weighted values where available
                contributions = [
                    weighted_scores[i][key][method] if key in weighted_scores[i] else value
                    for key, value in scores.items()
                    if key not in excluded_metrics
                ]
                average_image_score = sum(contributions) / len(contributions) if contributions else 0
                options.append({path: {'average_image_score': average_image_score, **scores}})

                method_story = llm_stories.setdefault(method, {'image_path': [], 'scores': []})
                method_story['image_path'].append(path)
                method_story['scores'].append({'average_image_score': average_image_score, **scores})
                method_story['prompt_include_one_character'] = prompt_include_one_character
                method_story['prompt_include_two_character'] = prompt_include_two_character
                method_story['prompt_include_no_character'] = prompt_include_no_character

            if not options:
                raise ValueError(
                    f"No generated image is available for prompt number {prompt.get('prompt_number', i + 1)} "
                    f"of {LLM}; cannot compile a story.")
            options_per_prompt.append(options)

        # Get the sorted combinations using the selected algorithm
        sorted_combination = search(options_per_prompt)

        new_compilations = 0
        for rank, combination in enumerate(sorted_combination[:100], start=1):
            combination_paths = _combination_paths(combination)
            matching_methods = [method for method, method_paths in story_method_collector.items()
                                if method_paths == combination_paths]

            if matching_methods:
                # the combination is exactly the story of an existing method
                for method in matching_methods:
                    llm_stories[method]['compilation_rank'] = rank
                    print(f'- {method} method is the top {rank} best story compilation for prompts generated by {LLM}')
            elif new_compilations < 5:
                new_compilations += 1
                llm_stories[f'compilation_{new_compilations}'] = {
                    'image_path': combination_paths,
                    'scores': [next(iter(entry.values())) for entry in combination],
                    'compilation_rank': rank,
                    'prompt_include_one_character': prompt_include_one_character,
                    'prompt_include_no_character': prompt_include_no_character,
                    'prompt_include_two_character': prompt_include_two_character}

        print(f'Compilation is done for {LLM}.')
    return story_figures


def identity_preservation(generated_image_path, real_image_paths, eval_method):
    """
    Score how well a generated image preserves the identity shown in each real image
    (image-to-image alignment)
    :param generated_image_path: Path of the generated image to evaluate
    :param real_image_paths: Iterable of paths of the real (reference) images
    :param eval_method: 'ViTS_16_DINO_embeddings_', 'FaceNet_' or 'inception_v3_'
    :return: List with one similarity score per real image, or None when eval_method is not recognised
    """
    # The scoring functions are declared as (reference_image_path, generated_image_path).
    # FaceNet and inception_v3_score are not symmetric: they embed the first face of the reference
    # image and take the best match among the faces of the generated image. Keyword arguments keep
    # the real photo in the reference role instead of relying on positional order.
    if eval_method == 'ViTS_16_DINO_embeddings_':
        # calculate the ViTS 16 DINO score for entity alignment (image-to-image alignment)
        return [ViTS_16_DINO_embeddings(reference_image_path=real_image_path,
                                        generated_image_path=generated_image_path)
                for real_image_path in real_image_paths]
    elif eval_method == 'FaceNet_':
        # calculate the FaceNet score for entity alignment (image-to-image alignment)
        return [FaceNet(reference_image_path=real_image_path,
                        generated_image_path=generated_image_path)
                for real_image_path in real_image_paths]
    elif eval_method == 'inception_v3_':
        # calculate the inception_v3 score for entity alignment (image-to-image alignment)
        return [inception_v3_score(reference_image_path=real_image_path,
                                   generated_image_path=generated_image_path)
                for real_image_path in real_image_paths]

    # Unknown evaluation methods keep the previous behaviour of yielding None
    return None


def _adaface_best_similarity(model, to_input, reference_feature, faces):
    """
    Return the highest similarity between the reference feature and any of the given faces
    :param model: Loaded AdaFace model; model(input) must return a pair whose first item is the feature
    :param to_input: AdaFace helper converting an aligned face into the model input
    :param reference_feature: Feature of the reference face
    :param faces: Aligned faces detected in one generated image
    :return: Best similarity found, or None when no face produced a usable similarity
    """
    best_similarity = -2  # sentinel the original loop starts from before any face is compared
    found = False
    for face in faces:
        feature, _ = model(to_input(face))
        # Entry [0][1] of the Gram matrix is the dot product between reference and candidate feature
        stacked = torch.cat([reference_feature, feature])
        similarity = (stacked @ stacked.T).tolist()[0][1]
        if similarity > best_similarity:
            best_similarity = similarity
            found = True
    return best_similarity if found else None


def adaface(characters):
    """
    Add AdaFace identity similarity scores to the filled prompt files of GPT-4 and LLaMa-3

    For every prompt and every involved character, the character's reference photo is compared with
    each generated image; the score is stored under scores[method]['adaface_<character number>'] and
    the JSON file is rewritten in place. A method receives 0 when the reference photo does not
    contain exactly one face, when face alignment raises IndexError for its image, or when no face
    is found in its image.
    :param characters: List of characters; only its length is used, to select the prompt file
    :return: None
    """
    # Path of the pretrained checkpoint
    checkpoint_path = os.path.join(os.getcwd(), 'pretrained', 'adaface_ir50_ms1mv2.ckpt')
    sys.path.insert(0, os.getcwd())
    sys.path.insert(0, os.path.join(os.getcwd(), 'AdaFace'))

    from AdaFace.face_alignment import align
    from AdaFace.inference import load_pretrained_model, to_input

    # Block until the checkpoint has been put in place
    while not os.path.exists(checkpoint_path):
        print(
            f"File adaface_ir50_ms1mv2.ckpt does not exist in {os.path.join(os.getcwd(), 'pretrained')}. Please upload it.")
        input("Press Enter when the file has been uploaded...")
        print("Verifying file upload...")
    print(f"File adaface_ir50_ms1mv2.ckpt has been successfully uploaded to {os.path.join(os.getcwd(), 'pretrained')}.")

    # Load the pretrained model once instead of once per character of every prompt.
    # NOTE(review): assumes load_pretrained_model returns the model in eval mode (as upstream
    # AdaFace does), so sharing one instance does not change the scores - confirm for the local copy.
    model = load_pretrained_model('ir_50')

    tag = 'two_characters' if len(characters) == 2 else (
        'three_characters' if len(characters) == 3 else 'four_characters')

    # Give an AdaFace score for prompts generated by GPT-4 and LLaMa-3 in turns
    for LLM in ('GPT_4', 'Llama_3'):
        json_file_path = os.path.join(os.getcwd(), f'prompt_{LLM}_{tag}_filled.json')

        with open(json_file_path, "r") as file:
            data = json.load(file)

        # Process each prompt dictionary and update it with score
        new_data = []

        for prompt_dict in data:
            # Shallow copy: the nested 'scores' dictionaries are updated in place
            modified_prompt_list = prompt_dict.copy()
            for x, char in enumerate(modified_prompt_list['Characters_involved']):
                score_key = f'adaface_{x + 1}'
                reference_path = char['random_photo']
                # Only methods that actually produced an image are scored
                generated_paths = {method: path for method, path in
                                   modified_prompt_list['generated_photo_path'].items() if path}

                # Feature of the reference photo, which must contain exactly one face
                try:
                    reference_faces = align.get_aligned_face(reference_path)
                    if len(reference_faces) != 1:
                        raise IndexError
                    with torch.no_grad():
                        reference_feature, _ = model(to_input(reference_faces[0]))
                except IndexError:
                    # Without a usable reference face, every method gets a zero score
                    reference_feature = None
                    print('*** No Faces have been recognized in the real image in this path: ', reference_path)

                # Score each method directly so that an image without faces can never shift the
                # scores of the following methods
                for method, path in generated_paths.items():
                    similarity = None
                    if reference_feature is not None:
                        try:
                            with torch.no_grad():
                                similarity = _adaface_best_similarity(
                                    model, to_input, reference_feature, align.get_aligned_face(path))
                        except IndexError:
                            # Face alignment failed for this generated image
                            similarity = None
                    modified_prompt_list['scores'][method][score_key] = 0 if similarity is None else similarity

            new_data.append(modified_prompt_list)

        # Convert the modified data to JSON format and save it
        with open(json_file_path, "w") as file:
            json.dump(new_data, file, indent=4)


def ViTS_16_DINO_embeddings(reference_image_path, generated_image_path):
    """
    Cosine similarity between the DINO ViT-S/16 cls-token embeddings of two images
    :param reference_image_path: Path of the real (reference) image
    :param generated_image_path: Path of the generated image
    :return: Cosine similarity as a Python float
    """
    T = transforms.Compose([
        transforms.Resize(256, interpolation=3),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])

    # Convert to RGB so that grayscale / RGBA / palette files match the three-channel
    # normalisation above, and close the files once they have been transformed.
    images = []
    for image_path in (generated_image_path, reference_image_path):
        with Image.open(image_path) as image:
            images.append(T(image.convert('RGB')))
    inputs = torch.stack(images)

    # Load DINO ViT-S/16
    model = ViTModel.from_pretrained('facebook/dino-vits16')

    # Get DINO features
    with torch.no_grad():
        outputs = model(inputs)

    last_hidden_states = outputs.last_hidden_state  # ViT backbone features
    emb_img1, emb_img2 = last_hidden_states[0, 0], last_hidden_states[1, 0]  # Get cls token (0-th token) for each img
    # Cosine similarity is symmetric, so the order of the two images does not affect the score
    metric = F.cosine_similarity(emb_img1, emb_img2, dim=0)

    return metric.item()


def FaceNet(reference_image_path, generated_image_path):
    """
    FaceNet identity similarity between a reference photo and a generated image

    The first face detected in the reference image is compared with every face detected in the
    generated image, and the best match is returned.
    :param reference_image_path: Path of the real (reference) image
    :param generated_image_path: Path of the generated image
    :return: Highest cosine similarity found, or 0 when no face is detected in either image
    """
    # Use the GPU when there is one, but keep working on CPU-only machines
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Initialize MTCNN and InceptionResnetV1
    mtcnn = MTCNN(keep_all=True, device=device)
    resnet = InceptionResnetV1(pretrained='vggface2').eval().to(device)

    # Load reference and generated images
    with Image.open(reference_image_path) as image:
        reference_image = image.convert('RGB')
    with Image.open(generated_image_path) as image:
        generated_image = image.convert('RGB')

    # Detect faces in reference and generated images
    ref_faces = mtcnn(reference_image)
    gen_faces = mtcnn(generated_image)

    # Handle cases where no faces are detected
    if ref_faces is None or gen_faces is None or len(ref_faces) == 0 or len(gen_faces) == 0:
        return 0  # Return a similarity score of 0

    # Inference only, so no gradients need to be tracked
    with torch.no_grad():
        # Extract embedding for the reference face
        ref_embedding = resnet(ref_faces[0].unsqueeze(0).to(device)).detach().cpu().numpy()

        # Calculate similarity for each generated face
        similarities = []
        for gen_face in gen_faces:
            gen_embedding = resnet(gen_face.unsqueeze(0).to(device)).detach().cpu().numpy()
            similarity = cosine_similarity(gen_embedding, ref_embedding)[0][0]
            similarities.append(similarity)

    # Take the maximum similarity score as the overall identity preservation measure
    max_similarity = max(similarities)

    return max_similarity


def inception_v3_score(reference_image_path, generated_image_path):
    """
    Inception v3 similarity between the faces of a reference photo and a generated image

    The first face detected in the reference image is compared with every face detected in the
    generated image, and the best match is returned.
    :param reference_image_path: Path of the real (reference) image
    :param generated_image_path: Path of the generated image
    :return: Highest cosine similarity found, or 0 when no face is detected in either image
    """
    # Use the GPU when there is one, but keep working on CPU-only machines
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Initialize MTCNN (face detector) and Inception v3 (feature extractor)
    mtcnn = MTCNN(keep_all=True, device=device)
    inception_v3 = torch.hub.load('pytorch/vision:v0.10.0', 'inception_v3', weights='IMAGENET1K_V1').eval().to(
        device)

    # Define image transformations
    preprocess = transforms.Compose([
        transforms.Resize((299, 299)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    to_pil = transforms.ToPILImage()

    def face_to_input(face):
        # facenet_pytorch's MTCNN standardises its crops by default (post_process=True):
        # (pixel - 127.5) / 128. ToPILImage expects floats in [0, 1], so undo the standardisation
        # first; otherwise the negative values are corrupted by the cast to 8-bit pixels.
        pixels = ((face * 128.0 + 127.5) / 255.0).clamp(0.0, 1.0)
        return preprocess(to_pil(pixels))

    # Load reference and generated images
    with Image.open(reference_image_path) as image:
        reference_image = image.convert('RGB')
    with Image.open(generated_image_path) as image:
        generated_image = image.convert('RGB')

    # Detect faces in reference and generated images
    ref_faces = mtcnn(reference_image)
    gen_faces = mtcnn(generated_image)

    # Handle cases where no faces are detected
    if ref_faces is None or len(ref_faces) == 0 or gen_faces is None or len(gen_faces) == 0:
        return 0  # Return a similarity score of 0

    # Preprocess detected faces and convert the lists to tensors
    ref_faces = torch.stack([face_to_input(face) for face in ref_faces]).to(device)
    gen_faces = torch.stack([face_to_input(face) for face in gen_faces]).to(device)

    # Inference only, so no gradients need to be tracked
    with torch.no_grad():
        # Extract embedding for the reference face
        ref_embedding = inception_v3(ref_faces[0].unsqueeze(0)).detach().cpu().numpy()

        # Calculate similarity for each generated face
        similarities = []
        for gen_face in gen_faces:
            gen_embedding = inception_v3(gen_face.unsqueeze(0)).detach().cpu().numpy()
            similarity = cosine_similarity(gen_embedding, ref_embedding)[0][0]
            similarities.append(similarity)

    # Take the maximum similarity score as the overall identity preservation measure
    max_similarity = max(similarities)

    return max_similarity


def main(characters):
    """
    Run the complete evaluation: per-image metrics, compilation of the best stories and the
    final GPT 4 evaluation
    :param characters: List of characters, handed to every evaluation stage
    :return: None
    """
    separator = '############################################################'

    # evaluate all images generated
    print(separator)
    print('########## Evaluation #########')
    print(separator)

    # (message before, metric function, message after) for each metric, in execution order
    metric_stages = (
        ('PickScore calculation is initiated...', PickScore_metric, 'PickScore is allocated'),
        ('human preference score calculation is initiated..', human_preference_score_metric,
         'human preference Score is allocated'),
        ('TIFA_ score calculation is initiated..', TIFA_metric_score, 'TIFA_ is allocated..'),
        ('adaface score calculation is initiated..', adaface, 'adaface is allocated'),
    )
    for start_message, run_metric, finished_message in metric_stages:
        print(start_message)
        run_metric(characters)
        print(finished_message)

    print('Now start constructing 5 compilation of images sequence with the highest scores..')
    # These identity metrics are left out of the compilation, one entry per character number
    excluded_metrics = []
    for character_number in range(1, len(characters) + 1):
        for metric_prefix in ('ViTS_16_DINO_embeddings_', 'inception_v3_'):
            excluded_metrics.append(f'{metric_prefix}{character_number}')

    # pick the top 5 ensemble stories
    story_figures = compile_5_new_stories(characters, exclude_metrics=excluded_metrics)
    print('done')

    print(separator)
    print('########## GPT 4 final evaluation #########')
    print(separator)
    # Final evaluation
    GPT4_score(characters, story_figures)


if __name__ == '__main__':
    photos_folder_1 = os.path.join(os.getcwd(), 'photos', 'Rizeh')
    photos_folder_2 = os.path.join(os.getcwd(), 'photos', 'Basel')
    photos_folder_3 = os.path.join(os.getcwd(), 'photos', 'Mamasalme')
    photos_folder_4 = os.path.join(os.getcwd(), 'photos', 'Keki')
    from character import CHARACTER

    character_1 = CHARACTER(
        photos_folder_1,
        gender='F',
        name="Rizeh",
        traits={
            'positive traits': ['Dutiful', 'Honest'],
            'neutral traits': ['Irreverent', 'Undemanding'],
            'negative traits': ['Tense', 'Ignorant']},
        unique_token='znrz',
        random_photo=os.path.join(photos_folder_1, 'znrz (1).JPG')
    )
    character_2 = CHARACTER(
        photos_folder_2,
        gender='M',
        name="Basel",
        unique_token='nsnn'
    )