# Source code for hyperlearn.impute.SVDImpute


from numpy import nanmean, nanstd, log1p, isnan, sqrt, nanmin, nan
from ..numba import minimum
from ..linalg import eig
from ..big_data.randomized import randomizedEig
from ..big_data.incremental import partialSVD


def fit(X, n_components = 'auto', standardise = True, copy = True):
	"""
	[Added 31/10/2018] [Edited 2/11/2018 Fixed SVDImpute]

	Learns the factors used for SVD imputation from the training data.
	Missing entries (NaN) are replaced by 0 before the decomposition; when
	standardise is True the data has already been centred and scaled per
	column, so 0 corresponds to the column mean.

	Setting n_components too low or too high mirrors mean imputation, and
	defeats the purpose of SVD imputation.

	Parameters:
	X 				2D array, NaN marks missing values. Assumed to be a
					floating point array (it is centred / scaled in place).
	n_components 	'auto' or None selects int(sqrt(p) - 1), where p is the
					number of columns. Any other value is used as given.
					The result is clamped to the range [1, p].
	standardise 	If True, columns are centred by their NaN-aware mean and
					scaled by their NaN-aware standard deviation (constant
					columns are scaled by 1).
	copy 			If truthy, work on a copy of X. Otherwise X itself is
					used as the workspace and restored (NaNs and original
					scale) before returning, up to floating point round-off.

	Returns:
	1. S 			square roots of the first output of randomizedEig
					(singular values)
	2. VT 			transposed second output of randomizedEig (eigenvectors)
	3. mean 		column means, or None if standardise is False
	4. std 			column standard deviations, or None
	5. mins 		column minimums, or None
	6. standardise 	the flag that was passed in, so it can be forwarded
					to transform
	"""
	_, p = X.shape
	if n_components is None or n_components == 'auto':
		k = int(sqrt(p) - 1)
	else:
		k = n_components
	if k <= 0: k = 1
	if k >= p: k = p

	C = X.copy() if copy else X
	# Computed before any in-place arithmetic so it still marks the
	# original missing entries when C aliases X.
	mask = isnan(X)

	if standardise:
		mean = nanmean(X, 0)
		std = nanstd(X, 0)
		mins = nanmin(X, 0)
		std[std == 0] = 1 	# avoid dividing constant columns by zero
		C -= mean
		C /= std
	else:
		mean = std = mins = None
	C[mask] = 0

	S, VT = randomizedEig(C, k)
	S **= 0.5
	VT = VT.T

	# Same truthiness test as the copy decision above, so every value of
	# `copy` that made C alias X (False, 0, None, ...) also restores X.
	if not copy:
		C[mask] = nan
		if standardise:
			C *= std
			C += mean
	return S, VT, mean, std, mins, standardise


def transform(X, S, VT, mean, std, mins, standardise, copy = True):
	"""
	[Added 31/10/2018] [Edited 2/11/2018 Fixed SVDImpute]

	Fills the missing entries (NaN) of X using the factors learnt by fit.

	The fundamental advantage of HyperLearn's SVD imputation is that a
	.transform method is provided. I do not require seeing the whole matrix
	for imputation, and can calculate SVD incrementally via the Incremental
	Module.

	Parameters:
	X 				2D array, NaN marks missing values. Assumed to be a
					floating point array.
	S, VT 			factors returned by fit; passed to partialSVD.
	mean, std, mins	column statistics returned by fit. Only used when
					standardise is True.
	standardise 	whether fit standardised the data; the same centring
					and scaling is applied here and undone afterwards.
	copy 			If truthy, X is left untouched and a new array is
					returned. Otherwise X is imputed in place and returned.

	Returns:
	The array with every originally-missing entry replaced by the
	corresponding entry of the low rank reconstruction U * S @ VT.
	"""
	_, p = X.shape
	D = X.copy() if copy else X
	# Computed before any in-place arithmetic so it still marks the
	# original missing entries when D aliases X.
	mask = isnan(X)

	if standardise:
		D -= mean
		D /= std
	D[mask] = 0

	U, S, VT = partialSVD(D, S, VT, solver = 'randomized')
	# `*` and `@` share precedence and associate left to right, so this
	# is (U * S) @ VT.
	reconstruction = U * S @ VT
	D[mask] = reconstruction[mask]

	if standardise:
		D *= std
		D += mean
		# Clamp each column to the minimum seen during fit.
		# NOTE(review): this clamps every entry of the column, so observed
		# values below the training minimum are raised too -- confirm
		# whether only imputed entries should be clamped.
		for j in range(p):
			floor = mins[j]
			column = D[:, j]
			column[column < floor] = floor

	# No restore step here: when copy is falsy D *is* X, and writing NaN
	# back / rescaling again would destroy the imputed result being returned.
	return D