
# -*- coding: utf-8 -*-
# LICENCE
"""
TODO:
    Does HDBSCAN work on 128 dim vectors?
    http://nbviewer.jupyter.org/github/lmcinnes/hdbscan/blob/master/notebooks/Comparing%20Clustering%20Algorithms.ipynb
"""
from __future__ import absolute_import, division, print_function, unicode_literals
from six.moves import zip, map  # NOQA
import ubelt as ub
import numpy as np
import utool as ut

from vtool._pyflann_backend import FLANN_CLS


def tune_flann2(data):
    flann = FLANN_CLS()
    flann_atkwargs = dict(
        algorithm='autotuned',
        target_precision=0.6,
        build_weight=0.01,
        memory_weight=0.0,
        sample_fraction=0.001,
    )
    print(flann_atkwargs)
    print('Autotuning flann')
    tuned_params = flann.build_index(data, **flann_atkwargs)
    return tuned_params


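# Hedged usage sketch (editor's addition, not part of the original module):
# tune_flann2 only forwards autotune kwargs to FLANN's build_index, so calling
# it looks like the function below. The data shape is an illustrative
# assumption; autotuning needs a reasonably large float32 dataset and a
# working pyflann backend to return meaningful parameters.
def _demo_tune_flann2():
    """ sketch of calling tune_flann2 on random SIFT-like descriptors """
    rng = np.random.RandomState(0)
    data = rng.rand(10000, 128).astype(np.float32)
    # tuned_params is whatever FLANN's autotuner reports (algorithm, checks, ...)
    tuned_params = tune_flann2(data)
    return tuned_params

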
class AnnoyWraper(object):
    """ flann-like interface to annoy """

    def __init__(self):
        pass

    def build_annoy(self, centroids, trees=3):
        import annoy
        self.a = annoy.AnnoyIndex(centroids.shape[1], metric='euclidean')
        for i, v in enumerate(centroids):
            self.a.add_item(i, v)
        self.a.build(trees)

    def query_annoy(self, query_vecs, num, checks=-1):
        a = self.a
        index_list = []
        dist_list = []
        for v in query_vecs:
            idx, dist = a.get_nns_by_vector(
                v, num, search_k=checks, include_distances=True
            )
            index_list.append(idx)
            dist_list.append(dist)
        return np.array(index_list), np.array(dist_list) ** 2

    def nn(self, data_vecs, query_vecs, num, trees=3, checks=-1):
        self.build_annoy(data_vecs, trees)
        return self.query_annoy(query_vecs, num, checks)


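# Hedged usage sketch (editor's addition, not part of the original module):
# AnnoyWraper mimics the pyflann ``nn`` call pattern and returns squared
# euclidean distances. The array sizes below are illustrative assumptions and
# the optional ``annoy`` package must be installed.
def _demo_annoy_wrapper():
    """ sketch of a nearest-neighbor query through AnnoyWraper """
    rng = np.random.RandomState(0)
    data_vecs = rng.rand(1000, 128).astype(np.float32)
    query_vecs = rng.rand(5, 128).astype(np.float32)
    wrapper = AnnoyWraper()
    # builds the index with 5 trees, then returns the 3 nearest data indices
    # and squared distances for each query vector
    index_arr, sqdist_arr = wrapper.nn(data_vecs, query_vecs, num=3, trees=5)
    assert index_arr.shape == (5, 3)
    return index_arr, sqdist_arr

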
def jagged_group(groupids_list):
    """ flattens and returns group indexes into the flattened list """
    # itemxs_iter = [[count] * len(idx2_groupid)
    #                for count, idx2_groupid in enumerate(groupids_list)]
    # flatx2_itemx = np.array(list(ub.flatten(itemxs_iter)))
    flatids = np.array(list(ub.flatten(groupids_list)))
    keys, groupxs = group_indices(flatids)
    return keys, groupxs


def apply_jagged_grouping(unflat_items, groupxs):
    """ takes unflat_list and flat group indices. Returns the unflat grouping """
    flat_items = np.array(list(ub.flatten(unflat_items)))
    item_groups = apply_grouping(flat_items, groupxs)
    return item_groups


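# Hedged usage sketch (editor's addition, not part of the original module):
# jagged_group flattens the jagged group ids, and apply_jagged_grouping then
# regroups the correspondingly flattened items. The toy lists below are
# illustrative assumptions.
def _demo_jagged_grouping():
    """ sketch of grouping a jagged list of items by jagged group ids """
    groupids_list = [[1, 2, 1], [2, 3], [1]]
    unflat_items = [['a', 'b', 'c'], ['d', 'e'], ['f']]
    keys, groupxs = jagged_group(groupids_list)
    item_groups = apply_jagged_grouping(unflat_items, groupxs)
    # keys is array([1, 2, 3]); item_groups pairs each key with its items,
    # e.g. ['a', 'c', 'f'] for key 1 and ['b', 'd'] for key 2
    return keys, item_groups

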
def groupedzip(id_list, datas_list):
    r"""
    Function for grouping multiple lists of data (stored in ``datas_list``)
    using ``id_list``.

    Args:
        id_list (list):
        datas_list (list):

    Returns:
        iterator: _iter

    CommandLine:
        python -m vtool.clustering2 --test-groupedzip

    Example:
        >>> # ENABLE_DOCTEST
        >>> from vtool.clustering2 import *  # NOQA
        >>> # build test data
        >>> id_list = np.array([1, 2, 1, 2, 1, 2, 3])
        >>> datas_list = [
        ...     ['a', 'b', 'c', 'd', 'e', 'f', 'g'],
        ...     ['A', 'B', 'C', 'D', 'E', 'F', 'G'],
        ... ]
        >>> # execute function
        >>> groupxs, grouped_iter = groupedzip(id_list, datas_list)
        >>> grouped_tuples = list(grouped_iter)
        >>> # verify results
        >>> result = str(groupxs) + '\n'
        >>> result += ub.repr2(grouped_tuples, nl=1)
        >>> print(result)
        [1 2 3]
        [
            (['a', 'c', 'e'], ['A', 'C', 'E']),
            (['b', 'd', 'f'], ['B', 'D', 'F']),
            (['g'], ['G']),
        ]
    """
    unique_ids, groupxs = group_indices(id_list)
    grouped_datas_list = [apply_grouping_(data, groupxs) for data in datas_list]
    grouped_iter = zip(*grouped_datas_list)
    return unique_ids, grouped_iter


def group_indices(idx2_groupid, assume_sorted=False):
    r"""
    group_indices

    Args:
        idx2_groupid (ndarray): numpy array of group ids (must be numeric)

    Returns:
        tuple (ndarray, list of ndarrays): (keys, groupxs)

    CommandLine:
        xdoctest -m ~/code/vtool/vtool/clustering2.py group_indices
        xdoctest -m ~/code/vtool/vtool/clustering2.py group_indices:0
        xdoctest -m ~/code/vtool/vtool/clustering2.py group_indices:1

    Example:
        >>> # ENABLE_DOCTEST
        >>> from vtool.clustering2 import *  # NOQA
        >>> idx2_groupid = np.array([2, 1, 2, 1, 2, 1, 2, 3, 3, 3, 3])
        >>> (keys, groupxs) = group_indices(idx2_groupid)
        >>> result = ut.repr2((keys, groupxs), nl=2, nobr=True, with_dtype=True)
        >>> print(result)
        np.array([1, 2, 3], dtype=np.int64),
        [
            np.array([1, 3, 5], dtype=np.int64),
            np.array([0, 2, 4, 6], dtype=np.int64),
            np.array([ 7, 8, 9, 10], dtype=np.int64),
        ],

    Example:
        >>> # ENABLE_DOCTEST
        >>> from vtool.clustering2 import *  # NOQA
        >>> idx2_groupid = np.array([[ 24], [ 129], [ 659], [ 659], [ 24],
        ...                          [659], [ 659], [ 822], [ 659], [ 659], [24]])
        >>> # 2d arrays must be flattened before coming into this function so
        >>> # information is on the last axis
        >>> (keys, groupxs) = group_indices(idx2_groupid.T[0])
        >>> result = ut.repr2((keys, groupxs), nl=2, nobr=True, with_dtype=True)
        >>> print(result)
        np.array([ 24, 129, 659, 822], dtype=np.int64),
        [
            np.array([ 0, 4, 10], dtype=np.int64),
            np.array([1], dtype=np.int64),
            np.array([2, 3, 5, 6, 8, 9], dtype=np.int64),
            np.array([7], dtype=np.int64),
        ],

    Example:
        >>> # ENABLE_DOCTEST
        >>> from vtool.clustering2 import *  # NOQA
        >>> idx2_groupid = np.array([True, True, False, True, False, False, True])
        >>> (keys, groupxs) = group_indices(idx2_groupid)
        >>> result = ut.repr2((keys, groupxs), nl=2, nobr=True, with_dtype=True)
        >>> print(result)
        np.array([False, True], dtype=np.bool),
        [
            np.array([2, 4, 5], dtype=np.int64),
            np.array([0, 1, 3, 6], dtype=np.int64),
        ],

    Time:
        >>> # xdoctest: +SKIP
        >>> import vtool as vt
        >>> setup = ut.extract_timeit_setup(vt.group_indices, 2, 'groupxs =')
        >>> print(setup)
        >>> stmt_list = ut.codeblock(
                '''
                [sortx[lx:rx] for lx, rx in ut.itertwo(idxs)]
                [sortx[lx:rx] for lx, rx in zip(idxs, idxs[1:])]
                #[sortx[lx:rx] for lx, rx in ut.iter_window(idxs)]
                #[sortx[slice(*_)] for _ in ut.itertwo(idxs)]
                #[sortx[slice(lr, lx)] for lr, lx in ut.itertwo(idxs)]
                #np.split(sortx, idxs[1:-1])
                #np.hsplit(sortx, idxs[1:-1])
                np.array_split(sortx, idxs[1:-1])
                ''').split('\n')
        >>> stmt_list = [x for x in stmt_list if not x.startswith('#')]
        >>> passed, times, outputs = ut.timeit_compare(stmt_list, setup, iterations=10000)
        >>> # xdoctest: +SKIP
        >>> stmt_list = ut.codeblock(
                '''
                np.diff(groupids_sorted)
                np.ediff1d(groupids_sorted)
                np.subtract(groupids_sorted[1:], groupids_sorted[:-1])
                ''').split('\n')
        >>> stmt_list = [x for x in stmt_list if not x.startswith('#')]
        >>> passed, times, outputs = ut.timeit_compare(stmt_list, setup, iterations=10000)

    Ignore:
        import numba
        group_indices_numba = numba.jit(group_indices)
        group_indices_numba(idx2_groupid)

    SeeAlso:
        apply_grouping

    References:
        http://stackoverflow.com/questions/4651683/
            numpy-grouping-using-itertools-groupby-performance

    TODO:
        Look into np.split
        http://stackoverflow.com/questions/21888406/
            getting-the-indexes-to-the-duplicate-columns-of-a-numpy-array
    """
    # Sort items and idx2_groupid by groupid
    if assume_sorted:
        sortx = np.arange(len(idx2_groupid))
        groupids_sorted = idx2_groupid
    else:
        sortx = idx2_groupid.argsort()
        groupids_sorted = idx2_groupid.take(sortx)

    # Ensure bools are internally cast to integers
    if groupids_sorted.dtype.kind == 'b':
        cast_groupids = groupids_sorted.astype(np.int8)
    else:
        cast_groupids = groupids_sorted

    num_items = idx2_groupid.size
    # Find the boundaries between groups
    diff = np.ones(num_items + 1, cast_groupids.dtype)
    np.subtract(cast_groupids[1:], cast_groupids[:-1], out=diff[1:num_items])
    idxs = np.flatnonzero(diff)
    # Groups are between bounding indexes
    # <len(keys) bottleneck>
    groupxs = [sortx[lx:rx] for lx, rx in ut.itertwo(idxs)]  # 34.5%
    # Unique group keys
    keys = groupids_sorted[idxs[:-1]]
    return keys, groupxs


def sorted_indices_ranges(groupids_sorted):
    """ Like group_indices on presorted ids, but returns a list of (start, stop) ranges """
    num_items = groupids_sorted.size
    # Ensure bools are cast to integers
    dtype = np.find_common_type([], [groupids_sorted.dtype, np.int8])
    # Find the boundaries between groups
    diff = np.ones(num_items + 1, dtype)
    np.subtract(groupids_sorted[1:], groupids_sorted[:-1], out=diff[1:num_items])
    idxs = np.flatnonzero(diff)
    group_ranges = [(lx, rx) for lx, rx in ut.itertwo(idxs)]  # 34.5%
    return group_ranges


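# Hedged usage sketch (editor's addition, not part of the original module):
# sorted_indices_ranges assumes its input is already sorted and returns
# (start, stop) pairs that slice out each run of equal ids.
def _demo_sorted_indices_ranges():
    """ sketch of slicing groups out of a presorted id array """
    groupids_sorted = np.array([1, 1, 2, 2, 2, 5])
    ranges = sorted_indices_ranges(groupids_sorted)
    # ranges == [(0, 2), (2, 5), (5, 6)]
    groups = [groupids_sorted[lx:rx] for lx, rx in ranges]
    return ranges, groups

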
def find_duplicate_items(item_arr):
    """
    Args:
        item_arr (ndarray):

    Returns:
        list: duplicate_items

    CommandLine:
        python -m vtool.clustering2 --test-find_duplicate_items

    References:
        http://stackoverflow.com/questions/21888406/getting-the-indexes-to-the-duplicate-columns-of-a-numpy-array

    Example:
        >>> # DISABLE_DOCTEST
        >>> from vtool.clustering2 import *  # NOQA
        >>> np.random.seed(0)
        >>> item_arr = np.random.randint(100, size=30)
        >>> duplicate_items = find_duplicate_items(item_arr)
        >>> assert duplicate_items == list(six.iterkeys(ut.find_duplicate_items(item_arr)))
        >>> result = str(duplicate_items)
        >>> print(result)
        [9, 67, 87, 88]
    """
    sortx = item_arr.argsort()
    groupids_sorted = item_arr.take(sortx)
    # duplicate_idxs = np.flatnonzero(~np.diff(groupids_sorted).astype(np.bool_))
    diff = np.diff(groupids_sorted)
    # notdiff = np.bitwise_not(diff.astype(np.bool_))
    edges = np.flatnonzero(diff.astype(np.bool_)) + 1
    duplicate_items = [
        group[0] for group in np.split(groupids_sorted, edges) if group.shape[0] > 1
    ]
    # duplicate_items = groupids_sorted.take(duplicate_idxs)
    return duplicate_items


def apply_grouping(items, groupxs, axis=0):
    """
    Applies the grouping computed by group_indices.

    Args:
        items (ndarray):
        groupxs (list of ndarrays):

    Returns:
        list of ndarrays: grouped items

    SeeAlso:
        group_indices
        invert_apply_grouping

    CommandLine:
        python -m vtool.clustering2 --test-apply_grouping

    Example:
        >>> # ENABLE_DOCTEST
        >>> from vtool.clustering2 import *  # NOQA
        >>> idx2_groupid = np.array([2, 1, 2, 1, 2, 1, 2, 3, 3, 3, 3])
        >>> items = np.array([1, 8, 5, 5, 8, 6, 7, 5, 3, 0, 9])
        >>> (keys, groupxs) = group_indices(idx2_groupid)
        >>> grouped_items = apply_grouping(items, groupxs)
        >>> result = str(grouped_items)
        >>> print(result)
        [array([8, 5, 6]), array([1, 5, 8, 7]), array([5, 3, 0, 9])]
    """
    # SHOULD DO A CONTIGUOUS CHECK HERE
    # items_ = np.ascontiguousarray(items)
    return [items.take(xs, axis=axis) for xs in groupxs]
    # return [items[idxs] for idxs in groupxs]


def apply_grouping_(items, groupxs):
    """ non-optimized version """
    return ut.apply_grouping(items, groupxs)


def invert_apply_grouping(grouped_items, groupxs):
    r"""
    Args:
        grouped_items (list): of lists
        groupxs (list): of lists

    Returns:
        list: items

    CommandLine:
        python -m vtool.clustering2 --test-invert_apply_grouping

    Example:
        >>> # ENABLE_DOCTEST
        >>> from vtool.clustering2 import *  # NOQA
        >>> grouped_items = [[8, 5, 6], [1, 5, 8, 7], [5, 3, 0, 9]]
        >>> groupxs = [np.array([1, 3, 5]), np.array([0, 2, 4, 6]), np.array([ 7, 8, 9, 10])]
        >>> items = invert_apply_grouping(grouped_items, groupxs)
        >>> result = items
        >>> print(result)
        [1, 8, 5, 5, 8, 6, 7, 5, 3, 0, 9]

    Example:
        >>> # ENABLE_DOCTEST
        >>> from vtool.clustering2 import *  # NOQA
        >>> grouped_items, groupxs = [], []
        >>> result = invert_apply_grouping(grouped_items, groupxs)
        >>> print(result)
        []
    """
    if len(grouped_items) == 0:
        assert (
            len(groupxs) == 0
        ), 'inconsistent. len(grouped_items)=%d, len(groupxs)=%d' % (
            len(grouped_items),
            len(groupxs),
        )
        return []
    # maxval = max(map(max, groupxs))
    maxval = _max(list(map(_max, groupxs)))
    ungrouped_items = [None] * (maxval + 1)  # np.full((maxval + 1,), None)
    for itemgroup, xs in zip(grouped_items, groupxs):
        for item, x in zip(itemgroup, xs):
            ungrouped_items[x] = item
    return ungrouped_items


def invert_apply_grouping3(grouped_items, groupxs, maxval):
    ungrouped_items = [None] * (maxval + 1)  # np.full((maxval + 1,), None)
    for itemgroup, xs in zip(grouped_items, groupxs):
        for item, x in zip(itemgroup, xs):
            ungrouped_items[x] = item
    return ungrouped_items


def _max(x):
    return np.max(x) if len(x) > 0 else 0


def invert_apply_grouping2(grouped_items, groupxs, dtype=None):
    """ use only when ungrouping will be complete """
    maxval = _max(list(map(_max, groupxs)))
    ungrouped_items = np.zeros((maxval + 1,), dtype=dtype)
    for itemgroup, ix_list in zip(grouped_items, groupxs):
        ungrouped_items[ix_list] = itemgroup
    return ungrouped_items


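# Hedged usage sketch (editor's addition, not part of the original module):
# when the group indices cover every position, grouping with apply_grouping
# and ungrouping with invert_apply_grouping2 is a lossless round trip.
def _demo_group_invert_roundtrip():
    """ sketch of a group / ungroup round trip """
    idx2_groupid = np.array([2, 1, 2, 1, 2, 1, 2, 3, 3, 3, 3])
    items = np.array([1, 8, 5, 5, 8, 6, 7, 5, 3, 0, 9])
    keys, groupxs = group_indices(idx2_groupid)
    grouped_items = apply_grouping(items, groupxs)
    restored = invert_apply_grouping2(grouped_items, groupxs, dtype=items.dtype)
    assert np.all(restored == items)
    return keys, grouped_items, restored

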
def apply_grouping_iter(items, groupxs):
    return (items.take(xs, axis=0) for xs in groupxs)


def apply_grouping_iter2(items, groupxs):
    return (np.array(list(items)).take(xs, axis=0) for xs in groupxs)


def groupby(items, idx2_groupid):
    """
    >>> items = np.array(np.arange(100))
    >>> idx2_groupid = np.array(np.random.randint(0, 4, size=100))
    """
    keys, groupxs = group_indices(idx2_groupid)
    vals = [items[idxs] for idxs in groupxs]
    return keys, vals


def groupby_gen(items, idx2_groupid):
    """
    >>> items = np.array(np.arange(100))
    >>> idx2_groupid = np.array(np.random.randint(0, 4, size=100))
    """
    for key, val in zip(*groupby(items, idx2_groupid)):
        yield (key, val)


def groupby_dict(items, idx2_groupid):
    # Build a dict
    grouped = {key: val for key, val in groupby_gen(items, idx2_groupid)}
    return grouped


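# Hedged usage sketch (editor's addition, not part of the original module):
# groupby_dict maps each group id to the numpy array of its items. The toy
# arrays below are illustrative assumptions.
def _demo_groupby_dict():
    """ sketch of building a group-id -> items dictionary """
    items = np.arange(10)
    idx2_groupid = np.array([0, 1, 0, 1, 2, 2, 0, 1, 2, 0])
    grouped = groupby_dict(items, idx2_groupid)
    # grouped[0] -> array([0, 2, 6, 9]); grouped[1] -> array([1, 3, 7]);
    # grouped[2] -> array([4, 5, 8])
    return grouped

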
# ---------------
# Plotting Code
# ---------------


def plot_centroids(
    data, centroids, num_pca_dims=3, whiten=False, labels='centroids', fnum=1, prefix=''
):
    """ Plots centroids and datapoints. Plots accurately up to 3 dimensions.
    If there are more than 3 dimensions, PCA is used to reduce the
    dimensionality to the <num_pca_dims> principal components
    """
    # http://www.janeriksolem.net/2012/03/isomap-with-scikit-learn.html
    from wbia.plottool import draw_func2 as df2

    data_dims = data.shape[1]
    show_dims = min(num_pca_dims, data_dims)
    if data_dims != show_dims:
        # we can't physiologically see the data, so look at a projection
        print('[akmeans] Doing PCA')
        from sklearn import decomposition

        pcakw = dict(copy=True, n_components=show_dims, whiten=whiten)
        pca = decomposition.PCA(**pcakw).fit(data)
        pca_data = pca.transform(data)
        pca_centroids = pca.transform(centroids)
        print('[akmeans] ...Finished PCA')
    else:
        # pca is not necessary
        print('[akmeans] No need for PCA')
        pca_data = data
        pca_centroids = centroids
    print(pca_data.shape)
    # Make a color for each centroid
    data_x = pca_data[:, 0]
    data_y = pca_data[:, 1]
    clus_x = pca_centroids[:, 0]
    clus_y = pca_centroids[:, 1]
    nCentroids = K = len(centroids)
    if labels == 'centroids':
        (datax2_label, dists) = FLANN_CLS().nn(centroids, data, 1)
    else:
        datax2_label = labels
    datax2_label = np.array(datax2_label, dtype=np.int32)
    print(datax2_label)
    assert len(datax2_label.shape) == 1, repr(datax2_label.shape)
    # if datax2_centroids is None:
    #     (datax2_centroidx, _) = FLANN_CLS().nn(centroids, data, 1)
    # data_colors = colors[np.array(datax2_centroidx, dtype=np.int32)]
    nColors = datax2_label.max() - datax2_label.min() + 1
    print('nColors=%r' % (nColors,))
    print('K=%r' % (K,))
    colors = np.array(df2.distinct_colors(nColors, brightness=0.95))
    clus_colors = np.array(df2.distinct_colors(nCentroids, brightness=0.95))
    assert labels != 'centroids' or nColors == K
    assert len(datax2_label.shape) == 1, repr(datax2_label.shape)
    data_colors = colors[datax2_label]
    # Create a figure
    fig = df2.figure(fnum, doclf=True, docla=True)
    if show_dims == 2:
        ax = df2.plt.gca()
        df2.plt.scatter(data_x, data_y, s=20, c=data_colors, marker='o', alpha=1)
        # df2.plt.scatter(data_x, data_y, s=20, c=data_colors, marker='o', alpha=.2)
        df2.plt.scatter(clus_x, clus_y, s=500, c=clus_colors, marker='*')
        ax.autoscale(enable=False)
        ax.set_aspect('equal')
        df2.dark_background(ax)
    if show_dims == 3:
        from mpl_toolkits.mplot3d import Axes3D  # NOQA

        ax = fig.add_subplot(111, projection='3d')
        data_z = pca_data[:, 2]
        clus_z = pca_centroids[:, 2]
        # ax.scatter(data_x, data_y, data_z, s=20, c=data_colors, marker='o', alpha=.2)
        ax.scatter(data_x, data_y, data_z, s=20, c=data_colors, marker='o', alpha=1)
        ax.scatter(clus_x, clus_y, clus_z, s=500, c=clus_colors, marker='*')
        ax.autoscale(enable=False)
        ax.set_aspect('equal')
        df2.dark_background(ax)
        # ax.set_alpha(.1)
        # ax.set_frame_on(False)
    ax = df2.plt.gca()
    waswhitestr = ' +whitening' * whiten
    titlestr = (
        '{prefix}AKmeans: K={K}.'
        'PCA projection {data_dims}D -> {show_dims}D'
        '{waswhitestr}'
    ).format(**locals())
    ax.set_title(titlestr)
    return fig


def uniform_sample_hypersphere(num, ndim=2, only_quadrent_1=False):
    r"""
    Not quite done yet

    References:
        https://en.wikipedia.org/wiki/Regular_polytope
        https://en.wikipedia.org/wiki/Platonic_solid#Higher_dimensions
        https://en.wikipedia.org/wiki/Cross-polytope

    Args:
        num (int):
        ndim (int): (default = 2)

    CommandLine:
        python -m vtool.clustering2 --test-uniform_sample_hypersphere

    Ignore:
        #pip install polytope
        sudo pip install cvxopt --no-deps

    Example:
        >>> # DISABLE_DOCTEST
        >>> from vtool.clustering2 import *  # NOQA
        >>> num = 100
        >>> ndim = 3
        >>> pts = uniform_sample_hypersphere(num, ndim)
        >>> print(pts)
        >>> # xdoctest: +REQUIRES(--show)
        >>> import wbia.plottool as pt
        >>> if ndim == 2:
        >>>     pt.plot(pts.T[0], pts.T[1], 'gx')
        >>> elif ndim == 3:
        >>>     #pt.plot_surface3d(pts.T[0], pts.T[1], pts.T[2])
        >>>     from mpl_toolkits.mplot3d import Axes3D  # NOQA
        >>>     fig = pt.figure(1, doclf=True, docla=True)
        >>>     ax = fig.add_subplot(111, projection='3d')
        >>>     ax.scatter(pts.T[0], pts.T[1], pts.T[2], s=20, marker='o', alpha=1)
        >>>     ax.autoscale(enable=False)
        >>>     ax.set_aspect('equal')
        >>>     pt.dark_background(ax)
        >>> pt.dark_background()
        >>> ut.show_if_requested()
    """
    import vtool as vt

    pts = np.random.rand(num, ndim)
    if not only_quadrent_1:
        pts = pts * 2 - 1
    pts = vt.normalize_rows(pts)
    return pts


def unsupervised_multicut_labeling(cost_matrix, thresh=0):
    """
    Notes:
        requires CPLEX

    CommandLine:
        python -m vtool.clustering2 unsupervised_multicut_labeling --show

    Ignore:
        >>> # synthetic data
        >>> import vtool as vt
        >>> size = 100
        >>> thresh = 50
        >>> np.random.randint(0, 1)
        >>> np.zeros((size, size))
        >>> #np.random.rand(size, size)
        >>> size = 45
        >>> #size = 10
        >>> size = 5
        >>> aids = np.arange(size)
        >>> rng = np.random.RandomState(443284320)
        >>> encounter_lbls = rng.randint(0, size, size)
        >>> separation = 5.0
        >>> separation = 1.10
        >>> grid1 = np.tile(encounter_lbls, (size, 1))
        >>> is_match = grid1.T == grid1
        >>> good_pos = np.where(is_match)
        >>> bad_pos = np.where(~is_match)
        >>> cost_matrix_ = np.zeros((size, size))
        >>> cost_matrix_[good_pos] = rng.randn(len(good_pos[0])) + separation
        >>> cost_matrix_[bad_pos] = rng.randn(len(bad_pos[0])) - separation
        >>> false_val = min(cost_matrix_.min(), np.min(rng.randn(1000) - separation))
        >>> true_val = max(cost_matrix_.max(), np.max(rng.randn(500) + separation))
        >>> cost_matrix_[np.diag_indices_from(cost_matrix_)] = true_val
        >>> #cost_matrix_[np.diag_indices_from(cost_matrix_)] = np.inf
        >>> cost_matrix = (cost_matrix_ - false_val) / (true_val - false_val)
        >>> cost_matrix = 2 * (cost_matrix - .5)
        >>> thresh = 0
        >>> labels = vt.unsupervised_multicut_labeling(cost_matrix, thresh)
        >>> diff = ut.find_group_differences(
        >>>     list(ut.group_items(aids, encounter_lbls).values()),
        >>>     list(ut.group_items(aids, labels).values()))
        >>> print('diff = %r' % (diff,))

        #gm, = ut.exec_func_src(vt.unsupervised_multicut_labeling,
        #                       key_list=['gm'], sentinal='inf = opengm')
        #parameter = opengm.InfParam()
        #%timeit opengm.inference.Multicut(gm, parameter=parameter).infer()

    Example:
        >>> # SCRIPT
        >>> from vtool.clustering2 import *  # NOQA
        >>> import networkx as nx
        >>> import wbia.plottool as pt
        >>> rng = np.random.RandomState(443284320)
        >>> pt.ensureqt()
        >>> #
        >>> def make_test_costmatrix(name_labels, view_labels, separation=2):
        >>>     is_same = name_labels == name_labels[:, None]
        >>>     is_comp = np.abs(view_labels - view_labels[:, None]) <= 1
        >>>     good_pos = np.where(is_same)
        >>>     bad_pos = np.where(~is_same)
        >>>     cost_matrix_ = np.zeros((len(name_labels), len(name_labels)))
        >>>     cost_matrix_[good_pos] = rng.randn(len(good_pos[0])) + separation
        >>>     cost_matrix_[bad_pos] = rng.randn(len(bad_pos[0])) - separation
        >>>     cost_matrix_ = (cost_matrix_.T + cost_matrix_) / 2
        >>>     false_val = min(cost_matrix_.min(), np.min(rng.randn(1000) - separation))
        >>>     true_val = max(cost_matrix_.max(), np.max(rng.randn(500) + separation))
        >>>     cost_matrix_[np.diag_indices_from(cost_matrix_)] = true_val
        >>>     cost_matrix = (cost_matrix_ - false_val) / (true_val - false_val)
        >>>     cost_matrix = 2 * (cost_matrix - .5)
        >>>     cost_matrix[np.where(~is_comp)] = 0
        >>>     return cost_matrix
        >>> #
        >>> view_labels = np.array([0, 0, 2, 2, 1, 0, 0, 0])
        >>> name_labels = np.array([0, 0, 0, 0, 0, 1, 1, 1])
        >>> #cost_matrix = make_test_costmatrix(name_labels, view_labels, 2)
        >>> cost_matrix = make_test_costmatrix(name_labels, view_labels, .9)
        >>> #
        >>> def multicut_value(cost_matrix, name_labels):
        >>>     grid1 = np.tile(name_labels, (len(name_labels), 1))
        >>>     isdiff = grid1.T != grid1
        >>>     cut_value = cost_matrix[isdiff].sum()
        >>>     return cut_value
        >>> #
        >>> aids = np.arange(len(name_labels))
        >>> #
        >>> graph = ut.nx_from_matrix(cost_matrix)
        >>> weights = nx.get_edge_attributes(graph, 'weight')
        >>> #
        >>> floatfmt1 = ut.partial(ub.map_vals, lambda x: 'w=%.2f' % x)
        >>> floatfmt2 = ut.partial(ub.map_vals, lambda x: 'l=%.2f' % x)
        >>> #
        >>> lens = ub.map_vals(lambda x: (1 - ((x + 1) / 2)) / 2, weights)
        >>> labels = floatfmt1(weights)
        >>> #labels = floatfmt2(lens)
        >>> nx.set_edge_attributes(graph, name='label', values=labels)
        >>> #nx.set_edge_attributes(graph, name='len', values=lens)
        >>> nx.set_node_attributes(graph, name='shape', values='ellipse')
        >>> encounter_lbls_str = [str(x) for x in name_labels]
        >>> node_name_lbls = dict(zip(aids, encounter_lbls_str))
        >>> import vtool as vt
        >>> #
        >>> mcut_labels = vt.unsupervised_multicut_labeling(cost_matrix, thresh=vt.eps)
        >>> diff = ut.find_group_differences(
        >>>     list(ut.group_items(aids, name_labels).values()),
        >>>     list(ut.group_items(aids, mcut_labels).values()))
        >>> print('diff = %r' % (diff,))
        >>> #
        >>> nx.set_node_attributes(graph, name='label', values=node_name_lbls)
        >>> node_mcut_lbls = dict(zip(aids, mcut_labels))
        >>> nx.set_node_attributes(graph, name='mcut_label', values=node_mcut_lbls)
        >>> #
        >>> print('mc_val(name) ' + str(multicut_value(cost_matrix, name_labels)))
        >>> print('mc_val(mcut) ' + str(multicut_value(cost_matrix, mcut_labels)))
        >>> #
        >>> ut.color_nodes(graph, 'mcut_label')
        >>> #
        >>> # remove noncomparable edges
        >>> is_comp = np.abs(view_labels - view_labels[:, None]) <= 1
        >>> #
        >>> noncomp_edges = list(zip(*np.where(~is_comp)))
        >>> graph.remove_edges_from(noncomp_edges)
        >>> #
        >>> layoutkw = {
        >>>     'sep' : 5,
        >>>     'prog': 'neato',
        >>>     'overlap': 'false',
        >>>     'splines': 'spline',
        >>> }
        >>> pt.show_nx(graph, layoutkw=layoutkw)
        >>> ut.show_if_requested()
    """
    import opengm
    from itertools import product
    # import wbia.plottool as pt

    cost_matrix_ = cost_matrix - thresh
    num_vars = len(cost_matrix_)

    # Enumerate undirected edges (node index pairs)
    var_indices = np.arange(num_vars)
    varindex_pairs = np.array(
        [
            (a1, a2)
            for a1, a2 in product(var_indices, var_indices)
            if a1 != a2 and a1 > a2
        ],
        dtype=np.uint32,
    )
    varindex_pairs.sort(axis=1)

    # Create nodes in the graphical model.  In this case there are <num_vars>
    # nodes and each node can be assigned to one of <num_vars> possible labels
    num_nodes = num_vars
    space = np.full((num_nodes,), fill_value=num_vars, dtype=opengm.index_type)
    gm = opengm.gm(space)

    # Use one potts function for each edge
    for varx1, varx2 in varindex_pairs:
        cost = cost_matrix_[varx1, varx2]
        potts_func = opengm.PottsFunction(
            (num_vars, num_vars), valueEqual=0, valueNotEqual=cost
        )
        potts_func_id = gm.addFunction(potts_func)
        var_indicies = np.array([varx1, varx2])
        gm.addFactor(potts_func_id, var_indicies)

    # pt.ensureqt()
    # opengm.visualizeGm(gm=gm)

    # Not sure what parameters are allowed to be passed here.
    parameter = opengm.InfParam()
    inf = opengm.inference.Multicut(gm, parameter=parameter)
    inf.infer()
    labels = inf.arg()
    # print(labels)
    return labels


# def alpha_expansion_cut(graph):
#     # https://github.com/amueller/gco_python/blob/master/example.py
#     import pygco
#     prob_annots2 = prob_annots.copy()
#     finite_probs = (prob_annots2[np.isfinite(prob_annots2)])
#     mean = finite_probs.mean()
#     # make symmetric
#     prob_annots2[~np.isfinite(prob_annots2)] = finite_probs.max() * 2
#     prob_annots2 = (prob_annots2.T + prob_annots2) / 2
#     int_factor = 100 / mean
#     pairwise_cost = (prob_annots2 * int_factor).astype(np.int32)
#     n_labels = 2
#     unary_cost = np.ones((prob_annots.shape[0], n_labels)).astype(np.int32)
#     u, v = np.meshgrid(np.arange(prob_annots.shape[0]).astype(np.int32),
#                        np.arange(len(prob_annots)).astype(np.int32))
#     edges = np.vstack((u.flatten(), v.flatten(),
#                        pairwise_cost.flatten())).T.astype(np.int32)
#     import pygco
#     n_iter = 5
#     algorithm = 'expansion'
#     unary_cost = np.ascontiguousarray(unary_cost)
#     #pairwise_cost = np.ascontiguousarray(pairwise_cost)
#     pairwise_cost = np.eye(n_labels).astype(np.int32)
#     edges = np.ascontiguousarray(edges)
#     pygco.cut_from_graph(edges, unary_cost, pairwise_cost, n_iter, algorithm)
#     pairwise_cost = prob_annots


def example_binary():
    import matplotlib.pyplot as plt
    import numpy as np
    from pygco import cut_simple, cut_from_graph

    # generate trivial data
    x = np.ones((10, 10))
    x[:, 5:] = -1
    x_noisy = x + np.random.normal(0, 0.8, size=x.shape)
    x_thresh = x_noisy > 0.0

    # create unaries
    unaries = x_noisy
    # as we convert to int, we need to multiply to get sensible values
    unaries = (10 * np.dstack([unaries, -unaries]).copy('C')).astype(np.int32)
    unaries[:] = 0
    # create potts pairwise
    pairwise = -10 * np.eye(2, dtype=np.int32)

    # do simple cut
    result = cut_simple(unaries, pairwise)

    # use the general graph algorithm
    # first, we construct the grid graph
    inds = np.arange(x.size).reshape(x.shape)
    horz = np.c_[inds[:, :-1].ravel(), inds[:, 1:].ravel()]
    vert = np.c_[inds[:-1, :].ravel(), inds[1:, :].ravel()]
    edges = np.vstack([horz, vert]).astype(np.int32)

    # we flatten the unaries
    result_graph = cut_from_graph(edges, unaries.reshape(-1, 2), pairwise)

    # plot results
    plt.subplot(231, title='original')
    plt.imshow(x, interpolation='nearest')
    plt.subplot(232, title='noisy version')
    plt.imshow(x_noisy, interpolation='nearest')
    plt.subplot(233, title='rounded to integers')
    plt.imshow(unaries[:, :, 0], interpolation='nearest')
    plt.subplot(234, title='thresholding result')
    plt.imshow(x_thresh, interpolation='nearest')
    plt.subplot(235, title='cut_simple')
    plt.imshow(result, interpolation='nearest')
    plt.subplot(236, title='cut_from_graph')
    plt.imshow(result_graph.reshape(x.shape), interpolation='nearest')
    plt.show()


if __name__ == '__main__':
    """
    CommandLine:
        python ~/code/vtool/vtool/clustering2.py all
        python -m vtool.clustering2 all
    """
    import xdoctest

    xdoctest.doctest_module(__file__)