This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 848cfcc667 [SYSTEMDS-3553] Additional nn optimizers (AdamW, ScaledGD)
848cfcc667 is described below
commit 848cfcc667e2bac9589c8f586574aedf7e3c17a8
Author: ReneEnjilian <[email protected]>
AuthorDate: Sat Feb 1 10:30:58 2025 +0100
[SYSTEMDS-3553] Additional nn optimizers (AdamW, ScaledGD)
DIA WiSe 24/25 project
Closes #2206.
---
scripts/nn/optim/adamw.dml | 97 ++++++++++++++++++++++++++
scripts/nn/optim/scaled_gd.dml | 151 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 248 insertions(+)
diff --git a/scripts/nn/optim/adamw.dml b/scripts/nn/optim/adamw.dml
new file mode 100644
index 0000000000..56938c6664
--- /dev/null
+++ b/scripts/nn/optim/adamw.dml
@@ -0,0 +1,97 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Adam optimizer with weight decay (AdamW)
+ */
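+
+/*
+ * Example usage (a minimal sketch for illustration; `W`, `dW`, and the
+ * relative source path are placeholders for the caller's parameters,
+ * gradients, and script location):
+ *
+ *   source("nn/optim/adamw.dml") as adamw
+ *   [m, v] = adamw::init(W)
+ *   t = 0
+ *   # per training iteration, after computing the gradient dW:
+ *   [W, m, v] = adamw::update(W, dW, 0.001, 0.9, 0.999, 1e-8, 0.01, t, m, v)
+ *   t = t + 1
+ */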
+
+update = function(matrix[double] X, matrix[double] dX, double lr, double beta1, double beta2,
+ double epsilon, double lambda, int t, matrix[double] m, matrix[double] v)
+ return (matrix[double] X, matrix[double] m, matrix[double] v)
+{
+ /*
+ * Performs an AdamW update.
+ *
+ * Reference:
+ * - Decoupled Weight Decay Regularization, Ilya Loshchilov, Frank Hutter.
+ * - https://arxiv.org/abs/1711.05101v3
+ *
+ * Inputs:
+ * - X: Parameters to update, of shape (any, any).
+ * - dX: Gradient wrt `X` of a loss function being optimized, of
+ * same shape as `X`.
+ * - lr: Learning rate. Recommended value is 0.001.
+ * - beta1: Exponential decay rate for the 1st moment estimates.
+ * Recommended value is 0.9.
+ * - beta2: Exponential decay rate for the 2nd moment estimates.
+ * Recommended value is 0.999.
+ * - epsilon: Smoothing term to avoid divide by zero errors.
+ * Recommended value is 1e-8.
+ * - lambda: Weight decay factor that penalizes large weights.
+ * Recommended value is 0.01.
+ * - t: Timestep, starting at 0.
+ * - m: State containing the 1st moment (mean) estimate by
+ * maintaining exponential moving averages of the gradients, of
+ * same shape as `X`.
+ * - v: State containing the 2nd raw moment (uncentered variance)
+ * estimate by maintaining exponential moving averages of the
+ * squared gradients, of same shape as `X`.
+ *
+ * Outputs:
+ * - X: Updated parameters `X`, of same shape as input `X`.
+ * - m: Updated state containing the 1st moment (mean) estimate by
+ * maintaining exponential moving averages of the gradients, of
+ * same shape as `X`.
+ * - v: Updated state containing the 2nd raw moment (uncentered
+ * variance) estimate by maintaining exponential moving averages
+ * of the squared gradients, of same shape as `X`.
+ */
+ t = t + 1
+ m = beta1*m + (1-beta1)*dX # update biased 1st moment estimate
+ v = beta2*v + (1-beta2)*dX^2 # update biased 2nd raw moment estimate
+ m_hat = m / (1-beta1^t) # compute bias-corrected 1st moment estimate
+ v_hat = v / (1-beta2^t) # compute bias-corrected 2nd raw moment estimate
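+ # Note: the lambda*X term below is the decoupled weight decay of
+ # Loshchilov & Hutter: it acts on the parameters directly instead of
+ # being folded into the gradient as an L2 penalty (as plain Adam would).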
+ X = X - lr * (m_hat / (sqrt(v_hat) + epsilon) + lambda * X)
+}
+
+init = function(matrix[double] X)
+ return (matrix[double] m, matrix[double] v)
+{
+ /*
+ * Initialize the state for this optimizer.
+ *
+ * Note: This is just a convenience function, and state
+ * may be initialized manually if needed.
+ *
+ * Inputs:
+ * - X: Parameters to update, of shape (any, any).
+ *
+ * Outputs:
+ * - m: Initial state containing the 1st moment (mean) estimate by
+ * maintaining exponential moving averages of the gradients, of
+ * same shape as `X`.
+ * - v: Initial state containing the 2nd raw moment (uncentered
+ * variance) estimate by maintaining exponential moving averages
+ * of the squared gradients, of same shape as `X`.
+ */
+ m = matrix(0, rows=nrow(X), cols=ncol(X))
+ v = matrix(0, rows=nrow(X), cols=ncol(X))
+}
diff --git a/scripts/nn/optim/scaled_gd.dml b/scripts/nn/optim/scaled_gd.dml
new file mode 100644
index 0000000000..3d6b61d77d
--- /dev/null
+++ b/scripts/nn/optim/scaled_gd.dml
@@ -0,0 +1,151 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * ScaledGD optimizer.
+ */
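+
+/*
+ * Example usage (a minimal sketch for illustration; `X`, `Y`, `dX`, `dY`,
+ * the learning rate of 0.01, and the relative source path are placeholders
+ * supplied by the caller's low-rank factorization model):
+ *
+ *   source("nn/optim/scaled_gd.dml") as scaled_gd
+ *   r = scaled_gd::init(X, Y)
+ *   # per training iteration, after computing the gradients dX and dY:
+ *   [X, Y] = scaled_gd::update(X, Y, dX, dY, 0.01, r)
+ */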
+
+update = function(matrix[double] X, matrix[double] Y,
+ matrix[double] dX, matrix[double] dY, double lr, int r)
+ return (matrix[double] X_new, matrix[double] Y_new)
+{
+ /*
+ * Performs one iteration of the Scaled Gradient Descent update.
+ *
+ * Reference:
+ * - "Accelerating Ill-Conditioned Low-Rank Matrix Estimation
+ * via Scaled Gradient Descent" (arXiv:2005.08898).
+ *
+ * Typical Steps:
+ * 1) Orthonormal Extension (dimension doubling):
+ * - Extend X and Y to [X, X⊥] and [Y, Y⊥] so each is m×2r and n×2r,
+ * with orthonormal columns.
+ * 2) Gradient Step:
+ * - Subtract lr*dX and lr*dY in the extended (2r) space.
+ * 3) Rank-r Truncation:
+ * - Recompute X_new, Y_new by SVD on the updated matrix
+ * of size m×n (i.e., from (X̂ - lr*dX̂)*(Ŷ - lr*dŶ)ᵀ).
+ * - Take only top-r singular values and corresponding vectors.
+ *
+ * Inputs:
+ * - X: Current m×r matrix (factor or parameter).
+ * - Y: Current n×r matrix (factor or parameter).
+ * - dX: Gradient w.r.t. X, same shape as X.
+ * - dY: Gradient w.r.t. Y, same shape as Y.
+ * - lr: Learning rate (scalar).
+ * - r: Target rank for the low-rank approximation.
+ *
+ * Outputs:
+ * - X_new: Updated factor/parameter matrix (m×r).
+ * - Y_new: Updated factor/parameter matrix (n×r).
+ */
+
+ #-----------------------------------------------------------
+ # 1) Orthonormal Extension for X and Y
+ #-----------------------------------------------------------
+ # We will form orthonormal complements for X and Y, each adding r columns.
+ # For simplicity, below we create random matrices and orthonormalize them.
+ # In the future, we might use more advanced approaches (QR-based or
+ # local expansions relevant to the gradient directions).
+ X_rand = rand(rows=nrow(X), cols=r)
+ Y_rand = rand(rows=nrow(Y), cols=r)
+
+ # Orthonormalize X
+ X_ext = cbind(X, X_rand)
+ # QR Decomposition: turn X_ext into an orthonormal basis.
+ # Note: SystemDS's 'qr' returns Q,R as multi-return.
+ [QX, RX] = qr(X_ext)
+ # We'll keep just 2r columns of Q (since Q might have dimension m×m)
+ X_hat = QX[, 1:(2*r)]
+
+ # Orthonormalize Y
+ Y_ext = cbind(Y, Y_rand)
+ [QY, RY] = qr(Y_ext)
+ Y_hat = QY[, 1:(2*r)]
+
+ #-----------------------------------------------------------
+ # 2) Gradient Step in Expanded Space
+ #-----------------------------------------------------------
+ # Similarly, we need the gradients w.r.t. X_hat, Y_hat. If 'dX' and 'dY'
+ # are for the original X, Y, a simple approach is to "expand" them
+ # by zero-padding for the extra columns.
+ dX_ext = cbind(dX, matrix(0, rows=nrow(X), cols=r))
+ dY_ext = cbind(dY, matrix(0, rows=nrow(Y), cols=r))
+
+ # Update step: X_hat_temp = X_hat - lr * dX_ext, etc.
+ X_hat_temp = X_hat - (lr * dX_ext)
+ Y_hat_temp = Y_hat - (lr * dY_ext)
+
+ #-----------------------------------------------------------
+ # 3) Rank-r Truncation via SVD
+ #-----------------------------------------------------------
+ # Construct a temporary matrix M_temp = X_hat_temp * (Y_hat_temp)ᵀ
+ M_temp = X_hat_temp %*% t(Y_hat_temp)
+
+ # SVD returns multiple outputs: U, S, and V
+ [U, S, V] = svd(M_temp)
+
+ # We will keep only the top-r singular values
+ # Note: S is a diagonal matrix. We can slice it or build from the diag vector.
+ S_diag = diag(S)
+ s_top = S_diag[1:r, ]
+ U_top = U[, 1:r]
+ V_top = V[, 1:r]
+
+ # Reconstruct X, Y from the rank-r approximation:
+ # M_temp ≈ U_top * diag(s_top) * V_topᵀ
+ # Often we store X_new = U_top * sqrt(diag(s_top)), Y_new = V_top * sqrt(diag(s_top))
+ sqrt_s_top = sqrt(s_top)
+ X_new = U_top %*% diag(sqrt_s_top)
+ Y_new = V_top %*% diag(sqrt_s_top)
+}
+
+init = function(matrix[double] X, matrix[double] Y)
+ return (int r)
+{
+ /*
+ * Here, we treat the number of columns (r) of X and Y
+ * as the "rank parameter" for ScaledGD.
+ * This parameter r is an upper bound on the actual
+ * algebraic rank, because some columns may become
+ * linearly dependent.
+ *
+ * Note: This is just a convenience function, and rank
+ * may be initialized manually if needed.
+ *
+ * Inputs:
+ * - X: Current m×r matrix (factor or parameter).
+ * - Y: Current n×r matrix (factor or parameter).
+ *
+ * Outputs:
+ * - r: Rank parameter (an upper bound on the algebraic rank).
+ */
+
+ if (ncol(X) != ncol(Y)) {
+ stop("X and Y must have the same number of columns in ScaledGD init.")
+ }
+
+ # The rank parameter (upper bound) is simply the number of columns in X
+ r = ncol(X)
+}
+