# Source code for hyperlearn.big_data.incremental


from numpy import vstack, newaxis, arange
from ..linalg import svd, eigh, eig
from .truncated import truncatedSVD, truncatedEigh
from ..utils import memoryXTX
from .randomized import randomizedSVD, randomizedEig
from ..exceptions import PartialWrongShape


def _utilSVD(batch, S, VT, eig = False):
	"""
	Builds the stacked matrix used by the partial solvers and
	validates the new batch against the old components.

	Inputs
		batch : (nrows, ncols) new rows of data.
		S     : (ncomponents) old singular values.
		VT    : (ncomponents, ncols) old components.
		eig   : if True, VT is transposed and S is replaced by
			S**0.5 before anything else is done (partialEig calls
			this with eigenvalues S2 and eigenvectors V).

	Returns a 3-tuple:
		1. S[:,newaxis]*VT stacked on top of batch.
		2. The number of rows of VT (ncomponents).
		3. Whatever memoryXTX returns for the stacked matrix.

	Raises PartialWrongShape if batch and VT do not have the
	same number of columns.
	"""
	if eig:
		VT = VT.T
		S = S**0.5

	ncomponents, ncols = VT.shape
	if batch.shape[1] != ncols:
		raise PartialWrongShape()

	# Each old component row is scaled by its matching entry of S,
	# then the new batch is appended underneath.
	scaled = S[:,newaxis]*VT
	stacked = vstack( (scaled, batch) )
	check = memoryXTX(stacked)

	return stacked, ncomponents, check



def partialSVD(batch, S, VT, ratio = 1, solver = 'full', tol = None, max_iter = 'auto'):
	"""
	Fits a partial SVD given the old singular values S
	and old components VT.

	Note that the number of rows of VT is used as the number of
	old components (k), so when calling truncated or randomized,
	a specific number of eigenvectors and singular values is output.

	Checks if the new batch's number of columns matches that of
	the old VT (PartialWrongShape is raised otherwise).

	Note that partialSVD has different solvers. Either choose:
		1. full
			Solves full SVD on the data. This is the most
			stable and will guarantee the most robust results.
			You can select the number of components to keep
			within the model later.

		2. truncated
			This keeps the top k right eigenvectors and top
			k singular values, where k is the number of old
			components. Note full SVD is not called for the
			truncated case, but rather ARPACK is called.

		3. randomized
			Same as truncated, but instead of using ARPACK, uses
			randomized SVD. Any solver value other than 'full'
			or 'truncated' also ends up here.

	Parameters
		batch    : new rows of data, (nrows, ncols).
		S        : old singular values, (ncomponents).
		VT       : old components, (ncomponents, ncols).
		ratio    : accepted but not used by this function.
		solver   : 'full', 'truncated' or 'randomized' (see above).
		tol      : passed to truncatedSVD only.
		max_iter : passed to randomizedSVD only.

	Returns
		(U[k:,:k], S[:k], VT[:k]) where U, S, VT come from the
		chosen solver applied to the old S*VT rows stacked on top
		of batch. The first k rows of U belong to the old rows and
		are dropped, so the returned U only covers batch.

	Notice how Batch = U @ S @ VT. However, partialSVD only returns
	the rows of U for the current batch. In order to get U for other
	data X, you might consider using the relation that
	X = U @ S @ VT, and approximating U by:

		X = U @ S @ VT
		X @ V = U @ S
		(X @ V)/S = U

		So, U = (X @ V)/S, so you can output U from (X @ V)/S

		You can also get U partially and slowly using reverseU.
	"""
	# data = old (S * VT) rows stacked above the new batch; k = number of
	# old components. The third return value is not needed here.
	data, k, __ = _utilSVD(batch, S, VT, eig = False)

	if solver == 'full':
		U, S, VT = svd(data)
	elif solver == 'truncated':
		U, S, VT = truncatedSVD(data, n_components = k, tol = tol)
	else:
		# Fallback: every other solver value uses the randomized solver.
		U, S, VT = randomizedSVD(data, n_components = k, max_iter = max_iter)

	# Rows 0..k-1 of data are the old S*VT block, so U[k:] are the rows
	# that correspond to the new batch. Keep only the top k components.
	return U[k:,:k], S[:k], VT[:k]




def partialEig(batch, S2, V, ratio = 1, solver = 'full', tol = None, max_iter = 'auto'):
	"""
	Fits a partial Eigendecomposition given the old eigenvalues S2
	and old eigenvector components V.

	Note that the number of columns of V is used as the number of
	old components (k), so when calling truncated or randomized,
	a specific number of eigenvectors and eigenvalues is output.

	Checks if the new batch's number of columns matches the number
	of rows of the old V (PartialWrongShape is raised otherwise).

	Note that partialEig has different solvers. Either choose:
		1. full
			Solves full Eigendecomposition on the data. This is the
			most stable and will guarantee the most robust results.
			The full result is returned without slicing to k, so
			you can select the number of components to keep
			within the model later.

		2. truncated
			This keeps the top k right eigenvectors and top
			k eigenvalues, where k is the number of old components.
			Note full Eig is not called for the truncated case,
			but rather ARPACK is called.

		3. randomized
			Same as truncated, but instead of using ARPACK, uses
			randomized Eig. Any solver value other than 'full'
			or 'truncated' also ends up here.

	Parameters
		batch    : new rows of data, (nrows, ncols).
		S2       : old eigenvalues, (ncomponents).
		V        : old eigenvectors, (ncols, ncomponents).
		ratio    : accepted but not used by this function.
		solver   : 'full', 'truncated' or 'randomized' (see above).
		tol      : passed to the truncated solvers only.
		max_iter : passed to randomizedEig only.

	Returns
		(S2, V) : the updated eigenvalues and eigenvectors. For the
		truncated and randomized solvers these are sliced to
		S2[:k] and V[:,:k].
	"""
	# eig = True makes _utilSVD transpose V and take S2**0.5, so that
	# data = sqrt(S2) * V.T stacked above the new batch.
	data, k, memCheck = _utilSVD(batch, S2, V, eig = True)

	if solver == 'full':
		S2, V = eig(data, svd = False)
		# NOTE(review): unlike the other solvers here (and unlike the 'full'
		# path of partialSVD), the result is returned without slicing to the
		# top k components - confirm this is intended.
		return S2, V

	elif solver == 'truncated':
		if memCheck:
			# memoryXTX's result is truthy: decompose data.T @ data directly.
			S2, V = truncatedEigh(data.T @ data, n_components = k, tol = tol)
		else:
			# Otherwise go through the SVD of data: squared singular values
			# give the eigenvalues, and VT is transposed to column eigenvectors.
			__, S2, V = truncatedSVD(data, n_components = k, tol = tol)
			S2**=2
			V = V.T
	else:
		# Fallback: every other solver value uses the randomized solver.
		S2, V = randomizedEig(data, n_components = k, max_iter = max_iter)

	return S2[:k], V[:,:k]