-
Notifications
You must be signed in to change notification settings - Fork 1
/
utils.py
248 lines (187 loc) · 9.41 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
import time
import logging
import networkx as nx
import numpy as np
# Module-level logger, named after this module per the standard logging convention.
logger = logging.getLogger(__name__)
# NOTE(review): forcing DEBUG at import time overrides any application-level
# logging configuration for this module — confirm this is intended.
logger.setLevel(logging.DEBUG)
def generate_candidates(X, strategy, target_rel, max_candidates, consolidate_sides=False, seed=0):
    """ Generate candidate statements from an existing knowledge graph using a defined strategy.

    Parameters
    ----------
    X: np.array, shape (n, 3)
        Triples from which to discover new facts.
    strategy: string
        The candidates generation strategy.
        - 'random_uniform' : generates N candidates (N <= max_candidates) based on a uniform random sampling of
          head and tail entities.
        - 'entity_frequency' : generates candidates by sampling entities weighted by their frequency in X
          (more frequent entities are more likely to be drawn).
        - 'graph_degree' : generates candidates by sampling entities weighted by their graph degree.
        - 'cluster_coefficient' : generates candidates by sampling entities weighted by their clustering coefficient.
        - 'cluster_triangles' : generates candidates by sampling entities weighted by their number of cluster triangles.
        - 'cluster_squares' : generates candidates by sampling entities weighted by their number of cluster squares.
    max_candidates: int or float
        The maximum numbers of candidates generated by 'strategy'.
        Can be an absolute number or a percentage [0,1].
        This does not guarantee the number of candidates generated.
    target_rel : str
        Target relation to focus on. The function will generate candidate
        statements only with this specific relation type.
    consolidate_sides: bool
        If True will generate candidate statements as a product of
        unique head and tail entities, otherwise will
        consider head and tail entities separately. Default: False.
    seed : int
        Seed to use for reproducible results.

    Returns
    -------
    X_candidates : ndarray, shape [n, 3]
        A list of candidate statements.

    Raises
    ------
    ValueError
        If `strategy` is unknown, or `max_candidates` is not a positive int/float.

    Examples
    --------
    >>> import numpy as np
    >>> from ampligraph.discovery.discovery import generate_candidates
    >>>
    >>> X = np.array([['a', 'y', 'b'],
    >>>               ['b', 'y', 'a'],
    >>>               ['a', 'y', 'c'],
    >>>               ['c', 'y', 'a'],
    >>>               ['a', 'y', 'd'],
    >>>               ['c', 'y', 'd'],
    >>>               ['b', 'y', 'c'],
    >>>               ['f', 'y', 'e']])
    >>> X_candidates = generate_candidates(X, strategy='graph_degree', target_rel='y', max_candidates=3)
    >>> ([['a', 'y', 'e'],
    >>>   ['f', 'y', 'a'],
    >>>   ['c', 'y', 'e']])
    """
    if strategy not in ['random_uniform', 'entity_frequency',
                        'graph_degree', 'cluster_coefficient',
                        'cluster_triangles', 'cluster_squares']:
        msg = '%s is not a valid candidate generation strategy.' % strategy
        raise ValueError(msg)

    if target_rel not in np.unique(X[:, 1]):
        # No error as may be case where target_rel is not in X
        msg = 'Target relation is not found in triples.'
        logger.warning(msg)

    if not isinstance(max_candidates, (float, int)):
        msg = 'Parameter max_candidates must be a float or int.'
        raise ValueError(msg)

    if max_candidates <= 0:
        msg = 'Parameter max_candidates must be a positive integer ' \
              'or float in range (0,1].'
        raise ValueError(msg)

    # A float is interpreted as a fraction of the number of input triples.
    if isinstance(max_candidates, float):
        max_candidates = int(max_candidates * len(X))

    def _filter_candidates(X_candidates, X, remove_reflexive=True):
        """ Inner function: drop candidates already present in X and,
        optionally, reflexive statements ['x', rel, 'x'].
        """
        X_candidates = _setdiff2d(X_candidates, X)
        if remove_reflexive:
            keep_idx = np.where(X_candidates[:, 0] != X_candidates[:, 2])
            X_candidates = X_candidates[keep_idx]
        return X_candidates

    def _normalize_weights(weights):
        """ Inner function: normalize to a probability vector; fall back to
        uniform when every weight is zero (e.g. clustering metrics on a
        triangle-free graph), which would otherwise yield NaN probabilities.
        """
        total = np.sum(weights)
        if total == 0:
            logger.warning('All sampling weights are zero; falling back to uniform sampling.')
            return np.full(len(weights), 1.0 / len(weights))
        return weights / total

    def _sample_candidates(e_s, e_o, e_s_weights=None, e_o_weights=None):
        """ Inner function: repeatedly sample head/tail entities, take their
        cartesian product with target_rel, filter, and accumulate up to
        max_candidates rows. `None` weights mean uniform sampling without
        replacement; otherwise weighted sampling with replacement.
        """
        # Take close to sqrt of max_candidates so that: len(meshgrid result) == max_candidates
        sample_size = int(np.sqrt(max_candidates) + 10)  # +10 to allow for reduction in sampled array due to filtering
        if e_s_weights is None:
            # Sampling without replacement cannot draw more than the pool size;
            # without this cap np.random.choice raises ValueError on small graphs.
            s_size = min(sample_size, len(e_s))
            o_size = min(sample_size, len(e_o))
        X_candidates = np.zeros([max_candidates, 3], dtype=object)  # Pre-allocate X_candidates array
        num_retries, max_retries = 0, 5  # Retry up to 5 times to reach max_candidates
        start_idx, end_idx = 0, 0
        while end_idx <= max_candidates - 1:
            if e_s_weights is None:
                sample_e_s = np.random.choice(e_s, size=s_size, replace=False)
                sample_e_o = np.random.choice(e_o, size=o_size, replace=False)
            else:
                sample_e_s = np.random.choice(e_s, size=sample_size, replace=True, p=e_s_weights)
                sample_e_o = np.random.choice(e_o, size=sample_size, replace=True, p=e_o_weights)
            gen_candidates = np.array(np.meshgrid(sample_e_s, target_rel, sample_e_o)).T.reshape(-1, 3)
            gen_candidates = _filter_candidates(gen_candidates, X)
            # Select either all of gen_candidates or just enough to fill X_candidates
            select_idx = min(len(gen_candidates), len(X_candidates) - start_idx)
            end_idx = start_idx + select_idx
            X_candidates[start_idx:end_idx, :] = gen_candidates[0:select_idx, :]
            start_idx = end_idx
            num_retries += 1
            if num_retries == max_retries:
                break
        # end_idx will equal max_candidates in most cases, but could be less
        return X_candidates[0:end_idx, :]

    # Set random seed
    np.random.seed(seed)

    # Get entities linked with this relation
    if consolidate_sides:
        e_s = np.unique(np.concatenate((X[:, 0], X[:, 2])))
        e_o = e_s
    else:
        e_s = np.unique(X[:, 0])
        e_o = np.unique(X[:, 2])

    logger.info('Generating candidates using {} strategy.'.format(strategy))

    if strategy == 'random_uniform':
        return _sample_candidates(e_s, e_o)

    if strategy == 'entity_frequency':
        # Per-entity occurrence counts; np.unique returns entities sorted, which
        # matches the ordering of e_s / e_o above.
        if consolidate_sides:
            e_s_counts = np.array(np.unique(X[:, [0, 2]], return_counts=True)).T
            e_o_counts = e_s_counts
        else:
            e_s_counts = np.array(np.unique(X[:, 0], return_counts=True)).T
            e_o_counts = np.array(np.unique(X[:, 2], return_counts=True)).T
        e_s_weights = _normalize_weights(e_s_counts[:, 1].astype(np.float64))
        e_o_weights = _normalize_weights(e_o_counts[:, 1].astype(np.float64))
    ## Customizable
    ## --------------------------------------------------------------------------------
    ## Add a new elif clause for a new strategy.
    ## --------------------------------------------------------------------------------
    else:  # 'graph_degree', 'cluster_coefficient', 'cluster_triangles', 'cluster_squares'
        # Create networkx graph (undirected; edge name carries the relation)
        G = nx.Graph()
        for row in X:
            G.add_nodes_from([row[0], row[2]])
            G.add_edge(row[0], row[2], name=row[1])
        # Calculate node metrics
        if strategy == 'graph_degree':
            C = dict(G.degree())
        elif strategy == 'cluster_coefficient':
            C = nx.algorithms.cluster.clustering(G)
        elif strategy == 'cluster_triangles':
            C = nx.algorithms.cluster.triangles(G)
        else:  # 'cluster_squares'
            C = nx.algorithms.cluster.square_clustering(G)
        e_s_weights = _normalize_weights(np.array([C[x] for x in e_s], dtype=np.float64))
        e_o_weights = _normalize_weights(np.array([C[x] for x in e_o], dtype=np.float64))

    return _sample_candidates(e_s, e_o, e_s_weights, e_o_weights)
def _setdiff2d(A, B):
    """ Utility function equivalent to numpy.setdiff1d on 2d arrays.

    Returns the rows of A that do not appear anywhere in B. Every occurrence
    of a matching row is removed (the previous cumsum-based implementation
    removed only the first duplicate of a row of A present in B); the order
    of the surviving rows of A is preserved.

    Parameters
    ----------
    A : ndarray, shape [n, m]
    B : ndarray, shape [k, m]

    Returns
    -------
    np.array, shape [j, m]
        Rows of A that are not in B.

    Raises
    ------
    RuntimeError
        If either input array is not 2-dimensional.
    """
    if len(A.shape) != 2 or len(B.shape) != 2:
        raise RuntimeError('Input arrays must be 2-dimensional.')
    # (n, k) boolean matrix: True where row i of A equals row j of B.
    row_matches = (A[:, None, :] == B[None, :, :]).all(axis=2)
    # Keep rows of A that match no row of B.
    return A[~row_matches.any(axis=1)]
def flatten_list(lst):
    """ Flatten one level of nesting: return a single list holding every
    element of every sub-iterable in `lst`, preserving order.
    """
    return [element for sub in lst for element in sub]