Source code for hyperlearn.metrics.euclidean


from ..utils import _XXT, rowSum, reflect
from numpy import zeros, newaxis
from numba import njit, prange
from ..sparse.csr import _XXT as _XXT_sparse, rowSum as rowSum_sparse
from ..sparse.tcsr import _XXT as _XXT_triangular
from ..numba import maximum


[docs]@njit(fastmath = True, nogil = True, cache = True) def mult_minus2(XXT): """ [Added 17/10/2018] Quickly multiplies XXT by -2. Uses notion that XXT is symmetric, hence only lower triangular is multiplied. """ n = len(XXT) for i in range(n): for j in range(i): XXT[i, j] *= -2 return XXT
def _maximum0(XXT, squared = True): """ [Added 15/10/2018] [Edited 21/10/2018] Computes maxmimum(XXT, 0) faster. Much faster than Sklearn since uses the notion that distance(X, X) is symmetric. Steps: maximum(XXT, 0) Optimised. Instead of n^2 operations, does n(n-1)/2 operations. """ n = len(XXT) for i in prange(n): XXT[i, i] = 0 for j in range(i): if XXT[i, j] < 0: XXT[i, j] = 0 if not squared: XXT[i, j] **= 0.5 return XXT maximum0 = njit(_maximum0, fastmath = True, nogil = True, cache = True) maximum0_parallel = njit(_maximum0, fastmath = True, nogil = True, parallel = True)
[docs]def euclidean_triangular(S, D, squared = False): """ [Added 21/10/2018] Quickly performs -2D + X^2 + X.T^2 on the TCSR matrix. Also applies maximum(D, 0) and then square roots distances if required. """ # Apply -2*D and add row-wise rowSum n = len(S) move = 0 # loop *-2 and adds S[:, newaxis] for i in prange(n-1): i1 = i+1 left = i*i1 // 2 s = S[i1] for j in range(left, left+i1): # mult by -2 D[j] *= -2 # add S[:, newaxis] D[j] += s # loop adds S[newaxis, :] for a in prange(n-1): s = S[a] for b in range(a, n-1): # add S[newaxis, :] or S c = b*(b+1) // 2 + a D[c] += s # maximum(D, 0) if D[c] < 0: D[c] = 0 if not squared: D[c] **= 0.5 return D
euclidean_triangular_single = njit(euclidean_triangular, fastmath = True, nogil = True, cache = True) euclidean_triangular_parallel = njit(euclidean_triangular, fastmath = True, nogil = True, parallel = True)
[docs]def euclidean_distances(X, Y = None, triangular = False, squared = False, n_jobs = 1): """ [Added 15/10/2018] [Edited 16/10/2018] [Edited 22/10/2018 Added Y option] Notice: parsing in Y will result in only 10% - 15% speed improvement, not 30%. Much much faster than Sklearn's implementation. Approx not 30% faster. Probably even faster if using n_jobs = -1. Uses the idea that distance(X, X) is symmetric, and thus algorithm runs only on 1/2 triangular part. Old complexity: X @ XT n^2p rowSum(X^2) np XXT*-2 n^2 XXT+X^2 2n^2 maximum(XXT,0) n^2 n^2p + 4n^2 + np New complexity: sym X @ XT n^2p/2 rowSum(X^2) np sym XXT*-2 n^2/2 sym XXT+X^2 n^2 maximum(XXT,0) n^2/2 n^2p/2 + 2n^2 + np So New complexity approx= 1/2(Old complexity) """ S = rowSum(X) if Y is X: # if X == Y, then defaults to fast triangular L2 distance algo Y = None if Y is None: XXT = _XXT(X.T) XXT = mult_minus2(XXT) XXT += S[:, newaxis] XXT += S #[newaxis,:] D = maximum0_parallel(XXT, squared) if n_jobs != 1 else maximum0(XXT, squared) if not triangular: D = reflect(XXT, n_jobs) else: D = X @ Y.T D *= -2 D += S[:, newaxis] D += rowSum(Y) D = maximum(D, 0) if not squared: D **= 0.5 return D
[docs]def euclidean_distances_sparse(val, colPointer, rowIndices, n, p, triangular = False, dense_output = True, squared = False, n_jobs = 1): """ [Added 15/10/2018] [Edited 21/10/2018] Much much faster than Sklearn's implementation. Approx not 60% faster. Probably even faster if using n_jobs = -1 (actually 73% faster). [n = 10,000 p = 1,000] Uses the idea that distance(X, X) is symmetric, and thus algorithm runs only on 1/2 triangular part. Also notice memory usage is now 60% better than Sklearn. If dense_output is set to FALSE, then a TCSR Matrix (Triangular CSR Matrix) is provided and not a CSR matrix. This has the advantage of using only 1/2n^2 - n memory and not n^2 memory. Old complexity: X @ XT n^2p rowSum(X^2) np XXT*-2 n^2 XXT+X^2 2n^2 maximum(XXT,0) n^2 n^2p + 4n^2 + np New complexity: sym X @ XT n^2p/2 rowSum(X^2) np sym XXT*-2 n^2/2 sym XXT+X^2 n^2 maximum(XXT,0) n^2/2 n^2p/2 + 2n^2 + np So New complexity approx= 1/2(Old complexity) """ if dense_output: XXT = _XXT_sparse(val, colPointer, rowIndices, n, p, n_jobs) XXT = mult_minus2(XXT) S = rowSum_sparse(val, colPointer, rowIndices) XXT += S[:, newaxis] XXT += S #[newaxis,:] XXT = maximum0_parallel(XXT, squared) if n_jobs != 1 else maximum0(XXT, squared) if not triangular: XXT = reflect(XXT, n_jobs) else: XXT = _XXT_triangular(val, colPointer, rowIndices, n, p, n_jobs) S = rowSum_sparse(val, colPointer, rowIndices) XXT = euclidean_triangular_parallel(S, XXT, squared = squared) if n_jobs != 1 else \ euclidean_triangular_single(S, XXT, squared = squared) return XXT