# Source code for hyperlearn.big_data.incremental


from numpy import vstack, newaxis, arange
from ..linalg import svd, eigh, eig
from .truncated import truncatedSVD, truncatedEigh
from ..utils import memoryXTX
from .randomized import randomizedSVD, randomizedEig
from ..exceptions import PartialWrongShape


def _utilSVD(batch, S, VT, eig = False):
	"""
	Stacks the previous decomposition on top of a new batch so the
	combined matrix can be decomposed again.

	batch : (nrows, ncols) new rows of data.
	S     : (ncomponents) old singular values. When eig is True the
	        values are square-rooted before use.
	VT    : (rows = ncomponents, cols = ncols) old components. When eig
	        is True the argument is transposed before use.

	Returns a 3-tuple:
		data        : S[:,newaxis]*VT stacked above batch.
		ncomponents : number of rows of VT (after any transpose).
		memCheck    : whatever memoryXTX(data) returns.

	Raises PartialWrongShape when batch and VT disagree on the number
	of columns.
	"""
	if eig:
		# Eigen-style inputs: bring them into the (S, VT) layout.
		VT = VT.T
		S = S**0.5

	n_old, width = VT.shape
	if batch.shape[1] != width:
		raise PartialWrongShape()

	# Each old component row is weighted by its singular value.
	weighted = S[:,newaxis] * VT
	data = vstack((weighted, batch))
	fits = memoryXTX(data)

	return data, n_old, fits



def partialSVD(batch, S, VT, ratio = 1, solver = 'full', tol = None, max_iter = 'auto'):
	"""
	Fits a partial SVD given old singular values S and old components VT.

	The number of rows of VT is used as the number of old components k,
	so the truncated and randomized solvers are asked for exactly k
	components. The new batch must have the same number of columns as VT
	(checked inside _utilSVD).

	Solvers:
		1. full
			Solves a full SVD on the stacked data. This is the most
			stable choice and gives the most robust results.
		2. truncated
			Keeps the top k right singular vectors and top k singular
			values by calling truncatedSVD (ARPACK based according to
			the original notes) instead of a full SVD.
		3. randomized
			Same as truncated, but uses randomizedSVD. Any solver value
			other than 'full' or 'truncated' also ends up here.

	Parameters:
		batch    : new rows of data, (nrows, ncols).
		S        : old singular values, (k).
		VT       : old components, (k, ncols).
		ratio    : accepted but not used by this function.
		solver   : 'full', 'truncated' or 'randomized'.
		tol      : forwarded to truncatedSVD only.
		max_iter : forwarded to randomizedSVD only.

	Returns U[k:,:k], S[:k], VT[:k].

	Getting U for other data:
		Since X = U @ S @ VT,
			X @ V       = U @ S
			(X @ V) / S = U
		so U can be approximated by (X @ V)/S. You can also get U
		partially and slowly using reverseU.
	"""
	data, k, __ = _utilSVD(batch, S, VT, eig = False)

	if solver == 'full':
		U, S, VT = svd(data)
	elif solver == 'truncated':
		U, S, VT = truncatedSVD(data, n_components = k, tol = tol)
	else:
		U, S, VT = randomizedSVD(data, n_components = k, max_iter = max_iter)

	# The first k rows of data are the old weighted components (see
	# _utilSVD), so only the rows of U from k onwards belong to the batch.
	return U[k:,:k], S[:k], VT[:k]
def partialEig(batch, S2, V, ratio = 1, solver = 'full', tol = None, max_iter = 'auto'):
	"""
	Fits a partial Eigendecomposition given old eigenvalues S2 and old
	eigenvector components V.

	The number of old components k is taken from V (its column count,
	since V is transposed inside _utilSVD), so the truncated and
	randomized solvers are asked for exactly k components. The new batch
	must have as many columns as V has rows (checked inside _utilSVD).

	Solvers:
		1. full
			Solves a full Eigendecomposition on the stacked data. This
			is the most stable choice and gives the most robust
			results. The output of eig is returned as is, without
			trimming to k; you can select the number of components to
			keep later.
		2. truncated
			Keeps the top k eigenvectors and top k eigenvalues without
			calling a full Eig (ARPACK based according to the original
			notes).
		3. randomized
			Same as truncated, but uses randomizedEig. Any solver value
			other than 'full' or 'truncated' also ends up here.

	Parameters:
		batch    : new rows of data, (nrows, ncols).
		S2       : old eigenvalues, (k).
		V        : old eigenvectors, (ncols, k).
		ratio    : accepted but not used by this function.
		solver   : 'full', 'truncated' or 'randomized'.
		tol      : forwarded to the truncated solvers only.
		max_iter : forwarded to randomizedEig only.

	Returns S2, V for the 'full' solver, otherwise S2[:k], V[:,:k].
	"""
	data, k, memCheck = _utilSVD(batch, S2, V, eig = True)

	if solver == 'full':
		S2, V = eig(data, svd = False)
		return S2, V

	elif solver == 'truncated':
		if memCheck:
			# memoryXTX approved: decompose data.T @ data directly.
			S2, V = truncatedEigh(data.T @ data, n_components = k, tol = tol)
		else:
			# Go through the SVD instead: square the singular values to
			# get eigenvalues and transpose VT back into V.
			__, S2, V = truncatedSVD(data, n_components = k, tol = tol)
			S2**=2
			V = V.T
	else:
		S2, V = randomizedEig(data, n_components = k, max_iter = max_iter)

	return S2[:k], V[:,:k]