Source code for hyperlearn.metrics.cosine



from ..utils import _XXT, rowSum, reflect, setDiagonal
from numpy import zeros, newaxis
from numba import njit, prange
from ..sparse.csr import div_1 ,mult_1, _XXT as _XXT_sparse, rowSum as rowSum_sparse
from ..sparse.tcsr import _XXT as _XXT_triangular



def cosine_sim_triangular(N, D):
    """
    [Added 21/10/2018]
    Normalise a packed strictly-lower-triangular (TCSR) product in place,
    i.e. perform X / norm_rows / norm_rows.T on the TCSR values.

    Layout used by this kernel: the entries of row r (r = 1 .. n-1) are
    stored contiguously, columns 0 .. r-1, starting at flat offset
    r*(r-1)//2.  Entry (r, c) is divided first by N[r] and then by N[c].

    Parameters
    ----------
    N : 1-D array of per-row norms, length n.
    D : 1-D array of packed values; indices 0 .. n*(n-1)//2 - 1 are
        touched.  Modified in place.

    Returns
    -------
    D, the same object that was passed in.

    Notes
    -----
    This plain function is compiled twice further down the module (serial
    and parallel), so it must stay within what Numba's nopython mode
    supports.  Each `prange` iteration writes only to its own row's slice
    of D, so iterations are independent.
    No guard is made against zero norms.
    """
    n = len(N)

    for i in prange(n - 1):
        row = i + 1
        start = i * row // 2      # flat offset of the first entry of `row`
        row_norm = N[row]
        for col in range(row):
            k = start + col
            # Two separate in-place divisions (row norm, then column norm)
            # keep the same intermediate rounding as dividing in two sweeps.
            D[k] /= row_norm
            D[k] /= N[col]
    return D
# Single-threaded compiled kernel (compiled with cache=True).
cosine_sim_triangular_single = njit(
    cosine_sim_triangular,
    fastmath=True,
    nogil=True,
    cache=True,
)

# Multi-threaded compiled kernel: parallel=True lets the `prange` loops of
# cosine_sim_triangular be distributed across threads.  Note that, unlike the
# serial variant, this one is compiled without cache=True.
cosine_sim_triangular_parallel = njit(
    cosine_sim_triangular,
    fastmath=True,
    nogil=True,
    parallel=True,
)
@njit(fastmath=True, nogil=True, cache=True)
def cosine_dis(XXT):
    """
    [Added 22/10/2018]
    Convert similarities to distances (1 - value) in place, touching only
    the entries strictly below the main diagonal.

    Parameters
    ----------
    XXT : 2-D array indexed as XXT[row, col]; modified in place.  The
        diagonal and the upper triangle are left exactly as they were.

    Returns
    -------
    XXT, the same array that was passed in.
    """
    size = len(XXT)
    # Row 0 has nothing below the diagonal, so start at row 1.
    for row in range(1, size):
        for col in range(row):
            # -x + 1 written as a single expression.
            XXT[row, col] = 1 - XXT[row, col]
    return XXT
@njit(fastmath=True, nogil=True, cache=True)
def cosine_dis_triangular(D):
    """
    [Added 22/10/2018]
    Convert every stored TCSR value from similarity to distance
    (1 - value), in place.

    Parameters
    ----------
    D : array of packed TCSR values; modified in place.

    Returns
    -------
    D, the same array that was passed in.
    """
    # One sweep over the leading axis: -x + 1 expressed as 1 - x.
    for k in range(len(D)):
        D[k] = 1 - D[k]
    return D
def cosine_similarity(X, Y=None, triangular=False, n_jobs=1, copy=False):
    """
    [Added 20/10/2018] [Edited 22/10/2018 Added Y option]
    Cosine similarity between the rows of X, or between the rows of X and
    the rows of Y.

    Parameters
    ----------
    X : 2-D array; each row is one sample.
    Y : optional 2-D array.  When None, or the very same object as X, the
        self-similarity path is taken.
    triangular : only used on the self-similarity path.  When False the
        result is passed through `reflect`; when True that step is skipped
        (presumably leaving just the triangle `_XXT` fills in - confirm
        against hyperlearn.utils).
    n_jobs : forwarded to `reflect`.
    copy : only used on the self-similarity path.  True: build the product
        from X as given, then divide the product by the row norms in place.
        False: divide X by its row norms first (a scaled temporary of X is
        created) and build the product from that.

    Returns
    -------
    Self-similarity path: the matrix returned by `_XXT` (after the optional
    `reflect`) with its diagonal set to 1.
    X-vs-Y path: X @ Y.T divided by the row norms of X (per row) and the
    row norms of Y (per column).

    Author's notes: with Y given the speed-up over Sklearn is only about
    5%; otherwise speed is about the same as Sklearn with roughly 10% less
    memory, and triangular=True is faster.
    """
    # `rowSum(..., norm=True)` presumably yields one L2 norm per row - confirm.
    x_norms = rowSum(X, norm=True)

    # Two genuinely different inputs: ordinary cross product.
    if Y is not None and Y is not X:
        sim = X @ Y.T
        sim /= x_norms[:, newaxis]
        sim /= rowSum(Y, norm=True)
        return sim

    # Self-similarity (Y omitted, or Y is X itself).
    if copy:
        gram = _XXT(X.T)
        gram /= x_norms[:, newaxis]
        gram /= x_norms          # broadcasts like x_norms[newaxis, :]
    else:
        gram = _XXT((X / x_norms[:, newaxis]).T)

    if not triangular:
        gram = reflect(gram, n_jobs)

    # A row is always perfectly similar to itself.
    setDiagonal(gram, 1)
    return gram
def cosine_similarity_sparse(val, colPointer, rowIndices, n, p, triangular=False,
                             dense_output=True, n_jobs=1, copy=True):
    """
    [Added 20/10/2018] [Edited 21/10/2018]
    Cosine similarity between the rows of a sparse matrix given by its raw
    arrays.

    Parameters
    ----------
    val, colPointer, rowIndices : the arrays describing the sparse matrix,
        passed straight through to the hyperlearn.sparse helpers
        (presumably CSR data / column indices / row pointers - confirm
        against hyperlearn.sparse.csr).
    n, p : matrix dimensions as expected by those helpers (presumably rows
        and columns - confirm).
    triangular : dense output only.  When False the dense result is passed
        through `reflect`; when True that step is skipped.
    dense_output : True returns a dense matrix with its diagonal set to 1.
        False returns a TCSR (packed strictly-lower-triangular) array,
        which holds n*(n-1)/2 values instead of n^2.
    n_jobs : forwarded to the product / reflect helpers; for TCSR output
        any value other than 1 selects the parallel normalisation kernel.
    copy : dense output only.  True: normalise the dense product afterwards.
        False: scale `val` with `div_1(..., copy=False)` before the product
        and then call `mult_1(..., copy=False)` with the same norms
        (presumably restoring the caller's values in place - confirm).

    Returns
    -------
    Dense similarity matrix, or the normalised TCSR value array.

    Author's note: slightly faster than Sklearn's cosine similarity.
    """
    norm_rows = rowSum_sparse(val, colPointer, rowIndices, norm=True)

    if dense_output:
        if copy:
            XXT = _XXT_sparse(val, colPointer, rowIndices, n, p, n_jobs)
            XXT /= norm_rows[:, newaxis]
            XXT /= norm_rows  # broadcasts like norm_rows[newaxis, :]
        else:
            val = div_1(val, colPointer, rowIndices, norm_rows, n, p, copy=False)
            XXT = _XXT_sparse(val, colPointer, rowIndices, n, p, n_jobs)
            val = mult_1(val, colPointer, rowIndices, norm_rows, n, p, copy=False)

        if not triangular:
            XXT = reflect(XXT, n_jobs)

        # diagonal is set to 1
        setDiagonal(XXT, 1)
    else:
        XXT = _XXT_triangular(val, colPointer, rowIndices, n, p, n_jobs)
        # Fix: the kernels defined in this module are named
        # cosine_sim_triangular_*; the previous cosine_triangular_* names
        # do not exist and raised NameError on this branch.
        if n_jobs != 1:
            XXT = cosine_sim_triangular_parallel(norm_rows, XXT)
        else:
            XXT = cosine_sim_triangular_single(norm_rows, XXT)
    return XXT
def cosine_distances(X, Y=None, triangular=False, n_jobs=1, copy=False):
    """
    [Added 15/10/2018] [Edited 18/10/2018] [Edited 22/10/2018 Added Y option]
    Cosine distance (1 - cosine similarity) between the rows of X, or
    between the rows of X and the rows of Y.

    Parameters
    ----------
    X : 2-D array; each row is one sample.
    Y : optional 2-D array.  When None, or the very same object as X, the
        self-distance path is taken.
    triangular : only used on the self-distance path.  When False the
        result is passed through `reflect`; when True that step is skipped,
        and only entries strictly below the diagonal have been converted
        to distances by `cosine_dis`.
    n_jobs : forwarded to `reflect`.
    copy : only used on the self-distance path.  True: build the product
        from X as given, then divide the product by the row norms in place.
        False: divide X by its row norms first (a scaled temporary of X is
        created) and build the product from that.

    Returns
    -------
    Self-distance path: the distance matrix with its diagonal set to 0.
    X-vs-Y path: 1 - (X @ Y.T) / (row norms of X, row norms of Y).

    Author's notes: with Y given the speed-up over Sklearn is only about
    5-10%; otherwise slightly faster than Sklearn, and roughly 50% faster
    with triangular=True.
    """
    # `rowSum(..., norm=True)` presumably yields one L2 norm per row - confirm.
    x_norms = rowSum(X, norm=True)

    # Two genuinely different inputs: ordinary cross product.
    if Y is not None and Y is not X:
        dist = X @ Y.T
        dist /= x_norms[:, newaxis]
        dist /= rowSum(Y, norm=True)
        # similarity -> distance, in place: -s + 1
        dist *= -1
        dist += 1
        return dist

    # Self-distance (Y omitted, or Y is X itself).
    if copy:
        gram = _XXT(X.T)
        gram /= x_norms[:, newaxis]
        gram /= x_norms          # broadcasts like x_norms[newaxis, :]
    else:
        gram = _XXT((X / x_norms[:, newaxis]).T)

    # 1 - similarity on the strictly lower triangle only.
    gram = cosine_dis(gram)

    if not triangular:
        gram = reflect(gram, n_jobs)

    # A row is at distance zero from itself.
    setDiagonal(gram, 0)
    return gram
def cosine_distances_sparse(val, colPointer, rowIndices, n, p, triangular=False,
                            dense_output=True, n_jobs=1, copy=True):
    """
    [Added 22/10/2018]
    Cosine distance (1 - cosine similarity) between the rows of a sparse
    matrix given by its raw arrays.

    Parameters
    ----------
    val, colPointer, rowIndices : the arrays describing the sparse matrix,
        passed straight through to the hyperlearn.sparse helpers
        (presumably CSR data / column indices / row pointers - confirm
        against hyperlearn.sparse.csr).
    n, p : matrix dimensions as expected by those helpers (presumably rows
        and columns - confirm).
    triangular : dense output only.  When False the dense result is passed
        through `reflect`; when True that step is skipped, and only entries
        strictly below the diagonal have been converted by `cosine_dis`.
    dense_output : True returns a dense matrix with its diagonal set to 0.
        False returns a TCSR (packed strictly-lower-triangular) array,
        which holds n*(n-1)/2 values instead of n^2.
    n_jobs : forwarded to the product / reflect helpers; for TCSR output
        any value other than 1 selects the parallel normalisation kernel.
    copy : dense output only.  True: normalise the dense product afterwards.
        False: scale `val` with `div_1(..., copy=False)` before the product
        and then call `mult_1(..., copy=False)` with the same norms
        (presumably restoring the caller's values in place - confirm).

    Returns
    -------
    Dense distance matrix, or the TCSR array of distances.

    Author's note: slightly faster than Sklearn's cosine distances.
    """
    norm_rows = rowSum_sparse(val, colPointer, rowIndices, norm=True)

    if dense_output:
        if copy:
            XXT = _XXT_sparse(val, colPointer, rowIndices, n, p, n_jobs)
            XXT /= norm_rows[:, newaxis]
            XXT /= norm_rows  # broadcasts like norm_rows[newaxis, :]
        else:
            val = div_1(val, colPointer, rowIndices, norm_rows, n, p, copy=False)
            XXT = _XXT_sparse(val, colPointer, rowIndices, n, p, n_jobs)
            val = mult_1(val, colPointer, rowIndices, norm_rows, n, p, copy=False)

        # XXT*-1 + 1 on the strictly lower triangle
        XXT = cosine_dis(XXT)

        if not triangular:
            XXT = reflect(XXT, n_jobs)

        # diagonal is set to 0 as zero distance between row i and i
        setDiagonal(XXT, 0)
    else:
        XXT = _XXT_triangular(val, colPointer, rowIndices, n, p, n_jobs)

        # Fix 1: normalise FIRST.  The distance is 1 - <xi,xj>/(|xi||xj|);
        # applying 1 - x before the division would instead produce
        # (1 - <xi,xj>)/(|xi||xj|).
        # Fix 2: the kernels defined in this module are named
        # cosine_sim_triangular_*; the previous cosine_triangular_* names
        # do not exist and raised NameError on this branch.
        if n_jobs != 1:
            XXT = cosine_sim_triangular_parallel(norm_rows, XXT)
        else:
            XXT = cosine_sim_triangular_single(norm_rows, XXT)

        # similarity -> distance: XXT*-1 + 1
        XXT = cosine_dis_triangular(XXT)
    return XXT