diff --git a/docs/src/math.md b/docs/src/math.md index 8fd5e86..b2d0665 100644 --- a/docs/src/math.md +++ b/docs/src/math.md @@ -133,3 +133,37 @@ We make use of a few key observations: - Projection on $\mathcal{Z}$ commutes with scaling - Projection on an interval commutes with scaling if scaling is also applied to the interval in question + +### Pock-Chambolle scaling + +Following Lemma 2 of: + +> Pock, Thomas, and Antonin Chambolle. "Diagonal preconditioning for first order primal-dual algorithms in convex optimization." *2011 International Conference on Computer Vision*. IEEE, 2011. [doi:10.1109/ICCV.2011.6126441](https://doi.org/10.1109/ICCV.2011.6126441) + +for matrix $K$ of size $m \times n$ and $\alpha \in [0, 2]$, the diagonal factors are + +```math +\tau_j = \left(\sum_i |K_{ij}|^{\alpha}\right)^{-1} \quad \text{(variable side, column $j$)} +``` + +```math +\sigma_i = \left(\sum_j |K_{ij}|^{2-\alpha}\right)^{-1} \quad \text{(constraint side, row $i$)} +``` + +so that $\|\Sigma^{1/2} K T^{1/2}\| \leq 1$ with $T = \mathrm{diag}(\tau_j), \Sigma = \mathrm{diag}(\sigma_i)$. CoolPDLP absorbs the $1/2$ powers into the rescaling factors: $D_2[j] = \tau_j^{1/2}$ and $D_1[i] = \sigma_i^{1/2}$. The exponent parameter $\alpha$ is exposed as `chambolle_pock_alpha`. See also [`CoolPDLP.chambolle_pock_preconditioner`](@ref). + +Note that CoolPDLP uses $2-\alpha$ for the row and $\alpha$ for the column, +to be aligned with the [PDLP paper](https://dl.acm.org/doi/abs/10.5555/3540261.3541809). This is the opposite of the +convention used in the original Pock-Chambolle paper referenced above. + +### Ruiz equilibration + +The Ruiz iteration alternately rescales each row and each column of $A$ by the square root of its $\ell_\infty$ norm. After enough iterations the rescaled matrix has all row and column $\infty$-norms close to 1. See also [`CoolPDLP.ruiz_preconditioner`](@ref). + +### Composition + +The two passes run one after the other: first Ruiz produces $D_1^{\mathrm{ruiz}}, D_2^{\mathrm{ruiz}}$, then Pock-Chambolle is applied to the already-Ruiz-rescaled matrix $D_1^{\mathrm{ruiz}} A D_2^{\mathrm{ruiz}}$ and produces $D_1^{\mathrm{cp}}, D_2^{\mathrm{cp}}$. The final scaling is then + +```math +D_1 = D_1^{\mathrm{cp}} D_1^{\mathrm{ruiz}} \qquad D_2 = D_2^{\mathrm{ruiz}} D_2^{\mathrm{cp}} +``` diff --git a/src/components/preconditioning.jl b/src/components/preconditioning.jl index 5d81ff7..bf97876 100644 --- a/src/components/preconditioning.jl +++ b/src/components/preconditioning.jl @@ -110,10 +110,42 @@ function diagonal_norm_preconditioner( return Preconditioner(Diagonal(d1), Diagonal(d2)) end -function chambolle_pock_preconditioner(cons::ConstraintMatrix; alpha::Number) - return diagonal_norm_preconditioner(cons; p_row = 2 - alpha, p_col = alpha) +""" + chambolle_pock_preconditioner(cons; alpha) + +Diagonal preconditioner from Lemma 2 of: + +> Pock, Thomas, and Antonin Chambolle. "Diagonal preconditioning for first order +> primal-dual algorithms in convex optimization." 2011 International Conference +> on Computer Vision. IEEE, 2011. doi:10.1109/ICCV.2011.6126441 + +For matrix `K` of size m×n and `α ∈ [0, 2]`, the variable-side factor for +column `j` is `(sum_i |K[i,j]|^α)^{-1/2}` and the constraint-side factor for +row `i` is `(sum_j |K[i,j]|^(2-α))^{-1/2}`. With these, `‖Σ^{1/2} K T^{1/2}‖ ≤ 1` +where `T, Σ` are the diagonals built from those factors. + +Note that CoolPDLP uses `2-α` for the row and `α` for the column, to be aligned with +the 2021 paper "Practical large-scale linear programming using primal-dual hybrid gradient" +of Applegate et al. - this is the opposite of the convention used in +the original Pock-Chambolle paper referenced above. +""" +function chambolle_pock_preconditioner(cons::ConstraintMatrix{T}; alpha::Number) where {T} + (; A, At) = cons + α_row = T(2 - alpha) + α_col = T(alpha) + col_sums = map(j -> column_power_sum(A, j, α_col), axes(A, 2)) + row_sums = map(i -> column_power_sum(At, i, α_row), axes(A, 1)) + d1 = map(s -> iszero(s) ? one(T) : inv(sqrt(s)), row_sums) + d2 = map(s -> iszero(s) ? one(T) : inv(sqrt(s)), col_sums) + return Preconditioner(Diagonal(d1), Diagonal(d2)) end +""" + ruiz_preconditioner(cons; iterations) + +Ruiz equilibration: for `iterations` rounds, rescale each row and column of `A` +(inside `cons`) by the square root of its inf norm. +""" function ruiz_preconditioner(cons::ConstraintMatrix; iterations::Integer) prec = identity_preconditioner(cons) for _ in 1:iterations diff --git a/src/utils/linalg.jl b/src/utils/linalg.jl index 1e5ff5e..7bbb3e7 100644 --- a/src/utils/linalg.jl +++ b/src/utils/linalg.jl @@ -107,6 +107,19 @@ end column_norm(A::AbstractMatrix, j::Integer, p) = norm(view(A, :, j), p) column_norm(A::SparseMatrixCSC, j::Integer, p) = norm(view(nonzeros(A), nzrange(A, j)), p) +""" + column_power_sum(A, j, p) + +Return `sum_i |A[i, j]|^p`. Zero entries are treated as contributing `0` (not `0^0 = 1`), +so the sparse and dense cases are aligned. +""" +@inline _power_term(x, p) = iszero(x) ? zero(x) : abs(x)^p + +column_power_sum(A::AbstractMatrix, j::Integer, p) = + sum(x -> _power_term(x, p), view(A, :, j); init = zero(eltype(A))) +column_power_sum(A::SparseMatrixCSC, j::Integer, p) = + sum(x -> _power_term(x, p), view(nonzeros(A), nzrange(A, j)); init = zero(eltype(A))) + mynnz(A::AbstractSparseMatrix) = nnz(A) mynnz(A::AbstractMatrix) = prod(size(A)) diff --git a/test/components/preconditioning.jl b/test/components/preconditioning.jl index a047da4..3b155e8 100644 --- a/test/components/preconditioning.jl +++ b/test/components/preconditioning.jl @@ -73,3 +73,29 @@ end @test CoolPDLP.clamp.(sol.x, milp.lv, milp.uv) ≈ prec.D2 * CoolPDLP.clamp.(sol_p.x, milp_p.lv, milp_p.uv) @test CoolPDLP.clamp.(milp.A * sol.x, milp.lc, milp.uc) ≈ prec.D1 \ CoolPDLP.clamp.(milp_p.A * sol_p.x, milp_p.lc, milp_p.uc) end + +@testset "Pock-Chambolle exponents" begin + rng = Xoshiro(7) + A = sprand(rng, 10, 20, 0.5) + cons = CoolPDLP.ConstraintMatrix(A, sparse(transpose(A))) + @testset "α = $alpha" for alpha in (0.0, 0.5, 1.0, 1.5, 2.0) + prec = CoolPDLP.chambolle_pock_preconditioner(cons; alpha = alpha) + At_csc = sparse(transpose(A)) + for j in axes(A, 2) + ref = sum( + x -> iszero(x) ? zero(x) : abs(x)^(alpha), + nonzeros(A)[nzrange(A, j)]; init = 0.0 + ) + expected = iszero(ref) ? 1.0 : inv(sqrt(ref)) + @test prec.D2.diag[j] ≈ expected + end + for i in axes(A, 1) + ref = sum( + x -> iszero(x) ? zero(x) : abs(x)^(2 - alpha), + nonzeros(At_csc)[nzrange(At_csc, i)]; init = 0.0 + ) + expected = iszero(ref) ? 1.0 : inv(sqrt(ref)) + @test prec.D1.diag[i] ≈ expected + end + end +end