src/preprocessing.py

# global
import string
from typing import List, Tuple

import numpy as np
import pandas as pd

import re
import nltk

from sklearn.utils import resample

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import RegexpTokenizer

import tensorflow as tf
from keras.layers import TextVectorization
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

# local
from utils import Doc2VecModel


punct = string.punctuation
stemmer = nltk.stem.PorterStemmer()
eng_stopwords = nltk.corpus.stopwords.words("english")


class Preprocessor:
    """Responsible for preprocessing case facts."""

    def __init__(self) -> None:
        pass

    def _nltk_tokenizer(self, text: str) -> List[str]:
        """
        Tokenize a given `text` using the RegexpTokenizer from the nltk library.

        Parameters:
        -----------
        - text : str
            A string containing the text to be tokenized.

        Returns:
        --------
        - tokens : List[str]
            A list of tokens generated by the tokenizer.
        """

        tokenizer = RegexpTokenizer(r"\w+")
        tokens = tokenizer.tokenize(text)

        return tokens

    def _tokenize_text(self, text_column: pd.Series) -> pd.Series:
        """Splitting `text_column` into tokens.

        Parameters:
        ------------
        - text_column : pd.Series
            Contains text that needs to be tokenized.

        Returns:
        --------
        - tokenized_text : pd.Series
            Contains tokenized version of `text_column`.
        """

        tokenized_text = text_column.apply(self._nltk_tokenizer)
        return tokenized_text

    def _convert_to_tagged_document(
        self, text_column: pd.Series
    ) -> Tuple[List[str], List[TaggedDocument]]:
        """
        Convert `text_column` of specific to TaggedDocuments.

        Parameters:
        ------------
        - column : pd.Series
            Contains the list of tokens of each fact.

        Returns:
        --------
        A tuble containing the following items:
            - tokens_list : list[str]
                Contains all tokens of each case in the `text_column`.
            - tagged_docs : list[TaggedDocument]
                Contains TaggedDocument object for each case.
        """

        tokens_list = text_column.to_list()
        tagged_docs = [TaggedDocument(t, [str(i)])
                       for i, t in enumerate(tokens_list)]

        return tokens_list, tagged_docs

    def _vectorize_text(
        self, doc2vec_model: Doc2Vec, df: pd.Series, tokens_list: List[str]
    ) -> pd.DataFrame:
        """
        Convert  values of `tokens_list` to a vector.

        Parameters:
        -----------
        - doc2vec_model : Doc2Vev
            Trained Doc2Vec model.
        - df : pd.Series
            This will use only to get its indicies for the new generated dataframe.
        - tokens_list : List[str]
            Contains all tokens of each case.

        Returns:
        --------
        - text_vectors_df : pd.DataFrame
            Contains the vector representaion for each case.
        """

        text_vectors = [doc2vec_model.infer_vector(doc) for doc in tokens_list]
        text_vectors_df = pd.DataFrame(text_vectors, index=df.index)

        return text_vectors_df

    def _anonymize_case_facts(
        self, first_party_name: str, second_party_name: str, facts: str
    ) -> str:
        """
        Anonymize case facts by replacing its party names with "_PARTY_" tag.

        Parameters:
        ------------
        - first_party_name : str
            Represents first party name or petitioner name.
        - second_party_name : str
            Represents second party name or respondent name.
        - facts : str
            Represents case facts.

        Returns:
        --------
        - anonymized_facts : str
            An anonymized version of `facts`.
        """

        # remove any commas and any non alphabet characters
        first_party_name = re.sub(r"[\,+]", " ", first_party_name)
        first_party_name = re.sub(r"[^a-zA-Z]", " ", first_party_name)

        second_party_name = re.sub(r"[\,+]", " ", second_party_name)
        second_party_name = re.sub(r"[^a-zA-Z]", " ", second_party_name)

        for name in first_party_name.split():
            facts = re.sub(name, " _PARTY_ ", facts)

        for name in second_party_name.split():
            facts = re.sub(name, " _PARTY_ ", facts)

        # replace any consecutive _PARTY_ tags with only one _PARTY_ tag.
        regex_continous_tags = r"(_PARTY_\s+){2,}"
        anonymized_facts = re.sub(regex_continous_tags, " _PARTY_ ", facts)
        # remove ant consecutive spaces
        anonymized_facts = re.sub(r"\s+", " ", anonymized_facts)

        return anonymized_facts

    def _preprocess_text(self, text: str) -> str:
        """
        Preprocessing & cleaning `text` including:
        - lowercasing
        - removing quotation marks
        - removing digits
        - removing punctuation
        - removing brackets, braces, and paranthesis
        - removeing stopwords
        - stemming tokens

        Parameters:
        ------------
        - text : str
            Text need to be processed (cleaned).

        Returns:
        --------
        - processed_text : str
            A preprocessed version of `text`.
        """

        text = text.lower()
        # remove quotation marks
        text = re.sub(r"\'", "", text)
        # remove digits
        text = re.sub(r"\d+", "", text)
        # remove punctuation but with keeping '_' letter
        text = "".join([ch for ch in text if (ch == "_") or (ch not in punct)])
        # remove brackets, braces, and parantheses
        text = re.sub(r"[\[\]\(\)\{\}]+", " ", text)
        tokens = nltk.word_tokenize(text)
        # remove stopwords and stemming tokens
        tokens = [stemmer.stem(token)
                  for token in tokens if token not in eng_stopwords]
        # convert tokens back to string
        processed_text = " ".join(tokens)

        return processed_text

    def convert_text_to_vectors_doc2vec(
        self,
        text_column: pd.Series,
        train: bool = True,
        embeddings_doc2vec: Doc2Vec = None,
    ) -> Tuple[Doc2Vec, pd.DataFrame] | pd.DataFrame:
        """
        Converting `text_column` to vectors using `Doc2Vec` model

        Parameters:
        ------------
        - text_column : pd.Series
            Contains the case facts.
        - train : bool, optional
            Defines whether the model will be trained or not. (if True, Doc2Vec will be trained |
            else, Doc2Vec will used the passed `embeddings_Doc2Vec`). (Default is True).
        - embeddings_doc2vec : Doc2Vec, optional
            Trained Doc2Vec model will be used for generating embeddings of `text_column` if
            `train` is False. (Default is None).

        Returns:
        --------
        1. A tuple contains the following:
            - embeddings_doc2vec : Doc2Vec
                Trained Doc2Vec model.
            - text_vectors_df : pd.DataFrame
                A DataFrame contains `text_column` vectors if `train` is True.

        2. text_vectors_df : pd.DataFrame
            A DataFrame contains `text_column` vectors if `train` is False.

        Raises:
        -------
        - AssertionError
            If train is False and `embeddings_doc2vec` is None.
        - AssertionError
            If train is False and `embedding_doc2vec` is not an instance of Doc2Vec
        """

        tokenized_text = self._tokenize_text(text_column)
        tokens_list, tagged_docs = self._convert_to_tagged_document(
            tokenized_text)

        if train:
            doc2vec_model = Doc2VecModel()
            embeddings_doc2vec = doc2vec_model.train_doc2vec_embeddings_model(
                tagged_docs
            )
            text_vectors_df = self._vectorize_text(
                embeddings_doc2vec, text_column, tokens_list
            )
            return embeddings_doc2vec, text_vectors_df

        assert (
            embeddings_doc2vec is not None
        ), "`embedding_doc2vec` argument must be not None."
        assert isinstance(
            embeddings_doc2vec, Doc2Vec
        ), "`embedding_doc2vec` argument must be an instance of Doc2Vec to infer vectors."
        text_vectors_df = self._vectorize_text(
            embeddings_doc2vec, text_column, tokens_list
        )

        return text_vectors_df

    def convert_text_to_vectors_tf_idf(
        self,
        text_column: pd.Series,
        ngrams: int = 2,
        max_tokens: int = 10000,
        output_mode: str = "tf-idf",
        train: bool = True,
        text_vectorizer: TextVectorization = None,
    ) -> Tuple[TextVectorization, tf.Tensor] | tf.Tensor:
        """
        Converting `text_column` to vectors using `TextVectorization` layer.

        Parameters:
        ------------
        - text_column : pd.Series
            Contains the case facts.
        - ngrams : int, optional
            Defines the number of n-gram (Default is 2).
        - max_tokens : int, optional
            Defines the number of max_tokens of `text_vectorizer` (Default is 10,000).
        - output_mode : str, optional
            Represents the output vectors type whether it is "tfi-df" or "binary" or "count"
            (Default is "tf-idf").
        - train : bool, optional
            Defines whether the model will be trained or not. (if True, TextVectorization
            will be trained, else, TextVectorization will used the passed `text_vectorizer`).
            (Default is True).
        - text_vectorizer : TextVectorization, optional
            Trained TextVectorization layer will be used for generating embeddings of
            `text_column` if `train` is False. (Default is None).

        Returns:
        --------
        - if `train` == True:
            A tuple contains the following:
                - text_vectorizer : TextVectorization
                    Trained TextVectorization layer.
                - text_vectors : tf.Tensor
                    A Tensor contains `text_column` training vectors.
        - otherwise:
            text_vectors : tf.Tensor
                A Tensor contains `text_column` testing vectors.

        Raises:
        -------
        - AssertionError
            If train is False and `text_vectorizer` is None.
        - AssertionError
            If train is False and `text_vectorizer` is not an instance of TextVectorization.
        """

        if train:
            text_vectorizer = TextVectorization(
                ngrams=ngrams, max_tokens=max_tokens, output_mode=output_mode
            )
            text_vectorizer.adapt(text_column)
            text_vectors = text_vectorizer(text_column)

            return text_vectorizer, text_vectors

        assert (
            text_vectorizer is not None
        ), "`text_vectorizer` argument must be not None."
        assert isinstance(
            text_vectorizer, TextVectorization
        ), "`text_vectorizer` argument must be an instance of TextVectorization to infer vectors."
        text_vectors = text_vectorizer(text_column)

        return text_vectors

    def convert_text_to_vectors_cnn(
        self,
        text_column: pd.Series,
        max_tokens: int = 2000,
        output_sequence_length: int = 500,
        output_mode: str = "int",
        train: bool = True,
        text_vectorizer: TextVectorization = None,
    ) -> Tuple[TextVectorization, tf.Tensor] | tf.Tensor:
        """
        Converting `text_column` to vectors using `TextVectorization` layer.

        Parameters:
        ------------
        - text_column : pd.Series
            Contains the case facts.
        - max_tokens : int, optional
            Defines the number of max_tokens of `text_vectorizer` (Default is 2000).
        - output_sequence_length : int, optional
            Represents the dimensions of the output vector (Default is 500).
        - output_mode : str, optional
            Represents the output vectors type whether it is "int" or "binary" or "tfi-df".
        - train : bool, optional
            Defines whether the model will be trained or not. (if True,
            TextVectorization will be trained | else, TextVectorization will used the
            passed `text_vectorizer`). (Default is True).
        - text_vectorizer : TextVectorization, optional
            Trained TextVectorization layer will be used for generating embeddings of
             `text_column` if `train` is False. (Default is None).

        Returns:
        --------
        - if `train` == True:
            A tuple contains the following:
                - text_vectorizer : TextVectorization
                    Trained TextVectorization layer.
                - text_vectors : tf.Tensor
                    A Tensor contains `text_column` training vectors.
        - otherwise:
            text_vectors : tf.Tensor
                A Tensor contains `text_column` testing vectors.

        Raises:
        -------
        - AssertionError
            If train is False and `text_vectorizer` is None.
        - AssertionError
            If train is False and `text_vectorizer` is not an instance of TextVectorization.
        """

        if train:
            text_vectorizer = TextVectorization(
                max_tokens=max_tokens,
                output_mode=output_mode,
                output_sequence_length=output_sequence_length,
            )
            text_vectorizer.adapt(text_column)
            text_vectors = text_vectorizer(text_column)
            return text_vectorizer, text_vectors

        assert (
            text_vectorizer is not None
        ), "`text_vectorizer` argument must be not None."
        assert isinstance(
            text_vectorizer, TextVectorization
        ), "`text_vectorizer` argument must be an instance of TextVectorization to infer vectors."
        text_vectors = text_vectorizer(text_column)

        return text_vectors

    def convert_text_to_vectors_glove(
        self,
        text_column: pd.Series,
        train: bool = True,
        glove_tokenizer: Tokenizer = None,
        vocab_size: int = 1000,
        oov_token: str = "<OOV>",
        max_length: int = 50,
        padding_type: str = "post",
        truncation_type: str = "post",
    ) -> Tuple[Tokenizer, np.ndarray] | np.ndarray:
        """
        Converting `text_column` to vectors using `glove_tokenizer`.

        Parameters:
        ------------
        - text_column : pd.Series
            Contains the case facts.
        - train : bool, optional
            Defines whether the model will be trained or not. (if True,
            Tokenizer will be trained | else, Tokenizer will used the
            passed `glove_tokenizer`). (Default is True).
        - glove_tokenizer : Tokenizer, optional
            Trained Tokenizer layer will be used for generating embeddings of
             `text_column` if `train` is False. (Default is None).
        - vocab_size : int, optional
            Represents the number of supported vocabulary of the Tokenizer,
            any token not in this vocabulary will be treated as an out-of-vocabulary
            token(OOV). (Default is 1000).
        - oov_tokens : str, optional
            Represents the token of an out-of-vocabulary token (Default is "<OOV>").
        - max_length : int, optional
            Defins the output vector's dimension. (Default is 50).
        - padding_type : str, optional
            Defines the padding type of the vectors, if the vector size is less than
            `max_length`, the rest of the `max_length` will be padded with 0 (Default is "post").
        - truncation_type : str, optional
            Defines the truncation type of the vectors, if the vector size is more than
            `max_length`, the extra of the `max_length` will be truncated (Default is "post").

        Returns:
        --------
        - if `train` == True:
            A tuple contains the following:
                - glove_tokenizer : Tokenizer
                    Trained Tokenizer layer.
                - text_padded : np.ndarray
                    An array contains `text_column` vectors.
        - otherwise:
            text_padded : np.ndarray
                An array contains `text_column` vectors.

        Raises:
        -------
        - AssertionError
            If train is False and `glove_tokenizer` is None.
        - AssertionError
            If train is False and `glove_tokenizer` is not instance of Tokenizer.
        """

        if train:
            glove_tokenizer = Tokenizer(
                num_words=vocab_size, oov_token=oov_token)
            glove_tokenizer.fit_on_texts(text_column)
            text_sequences = glove_tokenizer.texts_to_sequences(text_column)
            text_padded = pad_sequences(
                text_sequences,
                maxlen=max_length,
                padding=padding_type,
                truncating=truncation_type,
            )

            return glove_tokenizer, text_padded

        assert (
            glove_tokenizer is not None
        ), "`glove_tokenizer` argument must be not None."
        assert isinstance(
            glove_tokenizer, Tokenizer
        ), "`glove_tokenizer` argument must be an instance of Tokenizer."
        text_sequences = glove_tokenizer.texts_to_sequences(text_column)
        text_padded = pad_sequences(
            text_sequences,
            maxlen=max_length,
            padding=padding_type,
            truncating=truncation_type,
        )

        return text_padded

    def balance_data(self, X_train: pd.Series, y_train: pd.Series) -> pd.DataFrame:
        """
        Balancing `X_train` and `y_train` to distribute the targets in `y_train` equally.

        Parameters:
        ------------
        - text_column : pd.Series
             Contains the case facts.
         - y_train : pd.Series
             Contains the training targets.

         Returns:
         --------
         -  shuffled_balanced_df : pd.DataFrame
             Contains the new balanced dataframe with shuffling indicies.
        """

        df = pd.concat([X_train, y_train], axis=1)

        first_party = df[df["winner_index"] == 0]
        second_party = df[df["winner_index"] == 1]

        upsample_second_party = resample(
            second_party, replace=True, n_samples=len(first_party), random_state=42
        )

        upsample_df = pd.concat([upsample_second_party, first_party])

        shuffled_indices = np.arange(upsample_df.shape[0])
        np.random.shuffle(shuffled_indices)

        shuffled_balanced_df = upsample_df.iloc[shuffled_indices, :]

        return shuffled_balanced_df

    def anonymize_data(
        self,
        first_party_names: pd.Series,
        second_party_names: pd.Series,
        text_column: pd.Series,
    ) -> pd.Series:
        """
        Anonymize `text_column` by replacing `first_party_names` and
        `second_party_names` wit "_PARTY_" tag.

        Parameters:
        ------------
        - first_party_names : pd.Series
            Contains all first party names needed to be anonymized.
        - second_party_names : pd.Series
            Contains all second party names needed to be anonymized.
        - text_column : pd.Series
            Contains all texts needed to be anonymized.

        Returns:
        --------
        - all_anonyimzed_facts : pd.Series
            Contains anonymized version of `text_column`.
        """

        all_anonymized_facts = []

        for i in range(text_column.shape[0]):
            facts = text_column.iloc[i]
            first_party_name = first_party_names.iloc[i]
            second_party_name = second_party_names.iloc[i]
            anonymized_facts = self._anonymize_case_facts(
                first_party_name, second_party_name, facts
            )
            all_anonymized_facts.append(anonymized_facts)

        return pd.Series(all_anonymized_facts)

    def preprocess_data(self, text_column: pd.Series) -> pd.Series:
        """
        Preprocessing & cleaning all texts in `text_column`.

        Parameters:
        ------------
        - text_column : pd.Series
            Contains all case facts.

        Returns:
        --------
        - preprocessed_text : pd.Series
            Contains all texts after being processed.
        """

        preprocessed_text = text_column.apply(self._preprocess_text)
        return preprocessed_text