Source code for hyperlearn.big_data.lsmr


from ..numba import norm, _min, _max, _sign
from numpy import infty, zeros, float32, float64
from copy import copy
from ..utils import _float


def Orthogonalize(a, b):
    # Stable Givens rotation (c, s, r) with c*a + s*b = r and -s*a + c*b = 0,
    # computed without np.sqrt or min/max calls for speed.
    A, B, _a, _b = abs(a), abs(b), _sign(a), _sign(b)
    if b == 0:
        return _a, 0, A
    elif a == 0:
        return 0, _b, B
    elif B > A:
        tau = a/b
        s = _b/(1 + tau*tau)**0.5
        c = s*tau
        r = b/s
    else:
        tau = b/a
        c = _a/(1 + tau*tau)**0.5
        s = c*tau
        r = a/c
    return c, s, r
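
# A minimal sanity-check sketch (not part of the original module):
# Orthogonalize returns a Givens rotation, so for any (a, b) the identities
# c*a + s*b == r and -s*a + c*b == 0 should hold up to floating point error.
#
#     c, s, r = Orthogonalize(3.0, 4.0)    # -> (0.6, 0.8, 5.0)
#     assert abs(c*3.0 + s*4.0 - r) < 1e-12
#     assert abs(-s*3.0 + c*4.0) < 1e-12
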
def floatType(dtype):
    dtype = str(dtype)
    if '64' in dtype:
        return float64
    return float32
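
# Note that floatType matches on the substring '64', so 64-bit integer dtypes
# also map to float64. A small illustration (hypothetical usage, not part of
# the original module):
#
#     floatType(float64)      # -> float64
#     floatType('int64')      # -> float64 ('64' appears in 'int64')
#     floatType('float16')    # -> float32 (the fallback)
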
def lsmr(X, y, tol = 1e-6, condition_limit = 1e8, alpha = 0, threshold = 1e12, non_negative = False, max_iter = None):
    """
    [As of 12/9/2018, an optional non_negative argument is added. It is not
    as accurate as Scipy's NNLS, as it copies ideas from gradient descent.]

    Implements extremely fast least squares LSMR using orthogonalization,
    as seen in Scipy's LSMR and in https://arxiv.org/abs/1006.0758
    [LSMR: An iterative algorithm for sparse least-squares problems]
    by David Fong and Michael Saunders.

    Scipy's version of LSMR is surprisingly slow, as some slow design
    choices were used (e.g. calling np.sqrt on a single number is slower
    than number**0.5, and min(a, b) is slower than a single if statement).

    ALPHA is provided for regularization purposes, as in Ridge Regression.

    This algorithm also works well for sparse matrices, and the time
    complexity is approximately:

        X.T @ y computed min(n, p) times + roughly 3 O(n) operations
            ==> O(np) * min(n, p)
            ==> min(O(n^2 p + n), O(n p^2 + n))

    This complexity is much better than Cholesky Solve, the next fastest
    method in HyperLearn. Cholesky requires O(np^2) for X.T @ X, the
    Cholesky factorization needs an extra 1/3 * O(np^2), inversion takes
    another 1/3 * O(np^2), and finally X.T @ y needs O(np). So Cholesky
    needs O(5/3 np^2 + np) >> min(O(n^2 p + n), O(n p^2 + n)).

    So, by this factor analysis, expect LSMR to be roughly 2 times faster.

    Interestingly, the space complexity is even more staggering. LSMR only
    keeps a few length-n and length-p work vectors on top of X itself,
    while Cholesky must store X.T @ X, which is already O(p^2) [huge].

    Essentially, Cholesky shines when p is large but n is small; LSMR is
    good for large n and medium p.
    """
    damp = alpha
    check = False

    try:
        X = X.A            # Convert a matrix / sparse matrix to an ndarray
        X = _float(X)
    except AttributeError:
        pass               # X is already an ndarray
    dtype = X.dtype

    Y = y.ravel().copy()
    if y.dtype != dtype:
        Y = Y.astype(dtype)

    n, p = X.shape
    max_iter = _min(n, p) if max_iter is None else max_iter

    norm_Y = norm(Y)
    zero = zeros(p, dtype = dtype)
    theta_hat = zero.copy()

    beta = copy(norm_Y)
    XT = X.T

    if beta > 0:
        Y /= beta
        V = XT @ Y
        alpha = norm(V)
    else:
        V = zeros(p, dtype = dtype)
        alpha = 0
    if alpha > 0:
        V /= alpha

    # Initialize first iteration variables
    zeta_bar = alpha * beta
    alpha_bar = alpha
    rho, rho_bar, c_bar, s_bar = 1, 1, 1, 0

    H = V.copy()
    H_bar = zero.copy()

    # Initialize variables for estimation of ||r||.
    beta_dd = beta
    beta_d, rho_d_old, tau_tilde_old = 0, 1, 0
    theta_tilde, zeta, d = 0, 0, 0

    # Initialize variables for estimation of ||X|| and cond(X)
    norm_X2 = alpha*alpha
    max_r_bar, min_r_bar = 0, 1e100
    norm_X, cond_X, norm_theta = alpha, 1, 0

    # Early stopping
    c_tol = 1/condition_limit if condition_limit > 0 else 0
    norm_r = beta

    # If ||X.T @ y|| == 0, theta_hat == 0 is already the solution
    normAB = alpha*beta
    if normAB == 0:
        return theta_hat, True

    # Iteration loop
    for i in range(max_iter):
        # One step of Golub-Kahan bidiagonalization
        Y *= -alpha
        Y += X @ V
        beta = norm(Y)

        if beta > 0:
            Y /= beta
            V *= -beta
            V += XT @ Y
            alpha = norm(V)
            if alpha > 0:
                V /= alpha

        c_hat, s_hat, alpha_hat = Orthogonalize(alpha_bar, damp)

        # Plane rotation
        rho_old = rho
        c, s, rho = Orthogonalize(alpha_hat, beta)
        theta_new = s*alpha
        alpha_bar = c*alpha

        # Plane rotation
        rho_bar_old, zeta_old = rho_bar, zeta
        theta_bar, rho_temp = s_bar*rho, c_bar*rho
        c_bar, s_bar, rho_bar = Orthogonalize(c_bar*rho, theta_new)
        zeta = c_bar*zeta_bar
        zeta_bar *= -s_bar

        # Update H, H_bar, theta_hat
        H_bar *= -theta_bar*rho/(rho_old*rho_bar_old)
        H_bar += H
        theta_hat += (zeta/(rho*rho_bar))*H_bar

        # Just in case coefficients exceed the allowed maximum, zero them
        theta_hat[abs(theta_hat) > threshold] = 0.0

        if non_negative:
            theta_hat[theta_hat < 0] = 0.0

        H *= -theta_new/rho
        H += V

        # Estimate of ||r||
        beta_acute, beta_check = c_hat*beta_dd, -s_hat*beta_dd

        # Apply rotation
        beta_hat = c*beta_acute
        beta_dd = -s*beta_acute

        # Apply rotation
        theta_tilde_old = theta_tilde
        c_tilde_old, s_tilde_old, rho_tilde_old = Orthogonalize(rho_d_old, theta_bar)
        theta_tilde = s_tilde_old*rho_bar
        rho_d_old = c_tilde_old*rho_bar
        beta_d = c_tilde_old*beta_hat - s_tilde_old*beta_d

        tau_tilde_old = (zeta_old - theta_tilde_old*tau_tilde_old)/rho_tilde_old
        d += beta_check*beta_check
        partial = beta_d - (zeta - theta_tilde*tau_tilde_old)/rho_d_old
        norm_r = (d + partial*partial + beta_dd*beta_dd)**0.5

        # Estimate ||X||
        norm_X2 += beta*beta
        norm_X = norm_X2**0.5
        norm_X2 += alpha*alpha

        # Estimate cond(X) from the extreme rho_bar values seen so far
        max_r_bar = _max(max_r_bar, rho_bar_old)
        if i > 1:
            min_r_bar = _min(min_r_bar, rho_bar_old)
        cond_X = _max(max_r_bar, rho_temp)/_min(min_r_bar, rho_temp)

        # Convergence tests
        test_1 = norm_r/norm_Y
        X_r = norm_X * norm_r
        test_2 = abs(zeta_bar)/X_r if X_r != 0 else infty
        if 1 + test_2 <= 1:
            break
        elif test_2 <= tol:
            break

        test_3 = 1/cond_X
        check = (1 + test_3 <= 1)
        if check:
            break
        elif test_3 <= c_tol:
            break

        partial = 1 + norm_X*norm(theta_hat)/norm_Y
        t1 = test_1/partial
        r_tol = tol*partial
        if 1 + t1 <= 1:
            break
        elif test_1 <= r_tol:
            break

    return theta_hat, not check
    # Returned flag: True means the solve is good; False means cond(X) grew too large.
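
# A usage sketch (hypothetical, not part of the original module). It assumes
# the module is importable as hyperlearn.big_data.lsmr, per the page title,
# and that X is well conditioned so the returned flag is True:
#
#     import numpy as np
#     from hyperlearn.big_data.lsmr import lsmr
#
#     X = np.random.rand(1000, 20).astype(np.float32)
#     beta = np.random.rand(20).astype(np.float32)
#     y = X @ beta
#
#     theta_hat, ok = lsmr(X, y, tol = 1e-6)
#     print(ok, np.allclose(theta_hat, beta, atol = 1e-3))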