_region_.tex

\message{ !name(notebook.tex)}\documentclass[8pt]{report}

%% \usepackage[fleqn]{amsmath}
\usepackage[margin=1in]{geometry}
\usepackage{amsmath,amsfonts,amsthm,bm}
\usepackage{breqn}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{tikz}
\usepackage[ruled,vlined,linesnumbered,lined,boxed,commentsnumbered]{algorithm2e}
\usepackage{siunitx}
\usepackage{graphicx}
\usepackage{subcaption}
%% \usepackage{datetime}
\usepackage{multirow}
\usepackage{multicol}
\usepackage{mathrsfs}
\usepackage{fancyhdr}
\usepackage{fancyvrb}
\usepackage{parskip} %turns off paragraph indent
\pagestyle{fancy}

\usetikzlibrary{arrows}

% --- Math operators -------------------------------------------------------
% Starred form: sub/superscripts are set as limits in display style.
\DeclareMathOperator*{\argmin}{argmin}
% Variant with an explicit \limits appended after the operator.
\newcommand*{\argminl}{\argmin\limits}

\DeclareMathOperator*{\argmax}{argmax}
\newcommand*{\argmaxl}{\argmax\limits}

% \mathleft refers to internal names containing '@'. These are only read as
% single control sequences while '@' has letter catcode, so the definition
% must sit between \makeatletter and \makeatother; otherwise the macro
% would typeset the literal text "fleqntrue" / "mathmargin0pt".
% NOTE(review): assumes \if@fleqn and \@mathmargin are provided by amsmath
% (loaded above) -- confirm the left-aligned display behaves as intended.
\makeatletter
\newcommand{\mathleft}{\@fleqntrue\@mathmargin0pt}
\makeatother

% --- Number sets ----------------------------------------------------------
\newcommand{\R}{\mathbb{R}}
\newcommand{\Z}{\mathbb{Z}}
\newcommand{\N}{\mathbb{N}}

% --- Notation helpers -----------------------------------------------------
% \norm{x}       -> \|x\|          (fixed-size double bars)
% \ppartial{f}{x} -> partial derivative of f with respect to x, as a fraction
% \set{...}      -> \{...\}        (fixed-size braces)
\newcommand{\norm}[1]{\|#1\|}
\newcommand{\ppartial}[2]{\frac{\partial #1}{\partial #2}}
\newcommand{\set}[1]{\{#1\}}

% Raise the amsmath limit on the number of columns in matrix environments
% (bmatrix etc.) to 20.
\setcounter{MaxMatrixCols}{20}

% remove excess vertical space for align* equations
% NOTE(review): standard classes typically re-assign these four lengths inside
% \normalsize (executed at \begin{document}), so preamble values may be
% overridden -- confirm these settings actually take effect.
\setlength{\abovedisplayskip}{0pt}
\setlength{\belowdisplayskip}{0pt}
\setlength{\abovedisplayshortskip}{0pt}
\setlength{\belowdisplayshortskip}{0pt}

% multicol is already loaded earlier in the preamble; this second load is redundant.
\usepackage{multicol}

% Compact section headings (titlesec "tiny" option).
\usepackage[tiny]{titlesec}

% algorithm2e keywords:
%   \Do{<cond>}{<body>}    -- do ... while <cond> loop
%   \While{<cond>}{<body>} -- "while" with no trailing keyword after the
%                             condition, closed by "end while"
\SetKwRepeat{Do}{do}{while}
\SetKwFor{While}{while}{}{end while}%
\begin{document}

\message{ !name(notebook.tex) !offset(-3) }


\lhead{Notebook - Convex Optimization}
\rhead{Bill (Yuan) Liu}

\begin{multicols*}{2}

  \section{Symbols}

  $S^n \equiv$ set of all symmetric matrices\\
  
  $M_+^n \equiv$ set of all positive definite matrices (PD)\\
  
  $S_+^n \equiv$ set of all symmetric positive semidefinite matrices (SPSD)\\
  
  $S_{++}^n \equiv$ set of all symmetric positive definite matrices (SPD)\\
  
  $cl(X) \equiv$ closure of $X$\\

  $relint(X) \equiv$ relative interior of $X$\\
  
  $int(X) \equiv$ interior of $X$\\

  $[x]^+ \equiv max(x,0)$\\

  $I_C(x) = \begin{cases}
      0 &, x \in C\\
      \infty &, x \not\in C
    \end{cases}$\\
   
  $prox_{f,\lambda}(y) = \argmin_x \left( f(x)+\frac{1}{2 \lambda}\norm{x-y}_2^2 \right)$\\
    
  $\text{SoftThresholding}_{\lambda}(y) = prox_{\norm{\cdot}_1,\lambda}(y)$

  \vfill\null
  \columnbreak
  \vfill\null
  \columnbreak
  
  \section{Preliminary}
  
  Consider $f:\R^n \to \R$\\
  Gradient of $f$: $\nabla f(x) = \begin{bmatrix} \partial f / \partial x_i \\ .. \end{bmatrix}$\\
  $f(x) = a^Tx \implies \nabla f(x) = a$\\
  $f(x) = x^TPx, P=P^T \implies \nabla f(x) = 2Px$\\
  $f(x) = x^TPx \implies \nabla f(x) = 2(\frac{P^T+P}{2})x=(P^T+P)x$\\

  Taylor expansion approximation:\\
  $f(x) = f(x_0) + \nabla^T f(x_0)(x-x_0) + O(\norm{x-x_0}^2)$\\
  $f(x+\delta x) = f(x) + \nabla^T f(x)\delta x + O(\norm{\delta x}^2)$\\
  
  Chain rule:\\
  $f: \R^n \to \R, g: \R \to \R, h(x) = g(f(x))$\\
  $\nabla h(x) = g'(f(x))  \nabla f(x)$\\

  $g:\R^m \to \R, g(x) = f(Ax+b)$\\
  $\nabla g(x) = A^T \nabla f(Ax+b)$\\

  2nd derivative:\\
  $\nabla^2 f(x)=\begin{bmatrix}
    \partial^2 f / \partial x_1 \partial x_1 & ...\\
    .. & \partial^2 f / \partial x_n \partial x_n
  \end{bmatrix}$\\
  $f(x)=\frac{1}{2}x^TPx+g^Tx+r, P=P^T \implies$\\
  $\nabla f(x)=Px+g$\\
  $\nabla^2 f(x)=P$\\

  Hessian gives the 2nd order approximation:\\
  $f(x) \approx f(x_0) + \nabla^T f(x_0)(x-x_0) + \\ \frac{1}{2}(x-x_0)^T \nabla^2 f(x_0) (x-x_0)$\\

  Matrices:\\
  $A \in \R^{m \times n}$: set of all real matrices\\
  inner product: $\sum_i \sum_j x_{ij} y_{ij} = trace(XY^T)=trace(Y^TX)=\sum_{i}(XY^T)_{ii}$\\
  note trace has cyclic property\\
  frobenius norm: $\|X\|_F  = (\sum_i \sum_j X_{ij}^2)^{\frac{1}{2}}$\\
  range: $R(A) = \{Ax: x \in \R^n\}=\{\sum_i a_i x_i : x \in \R^n\}$, where $a_i$ is the $i$th column of $A$ (column space of $A$)\\
  null space: $N(A) = \{ x : Ax = 0\}$\\

  SVD:\\
  $A_{m \times n} = U_{m \times m} \Sigma_{m \times n} V_{n \times n}^T$\\
  U and V are the left and right singular vector matrices\\
  U and V are orthogonal matrices ($BB^T=B^TB=I$)\\
  $\Sigma$ is a rectangular diagonal matrix of singular values\\
  $A_{m \times n}x_{n}$\\
  linear transformation: $U \Sigma V^T x$\\
  rotation - scaling - rotation
  \vfill\null
  \columnbreak
  
  PSD matrix:\\
  $A\ PSD \iff (\forall x) x^TAx \geq 0 \iff (\forall i) \lambda_i(A) \geq 0$\\
  $A\ PSD \implies A^{1/2}$ exists\\

  Real symmetric matrices have real eigenvalues:
  \begin{align*}
    &Av=\lambda v\\ 
    &v^*Av=v^*\lambda v=\lambda \|v\|_2^2\\
    &(v^* A v)^*= v^* A^* v = \lambda^* \|v\|_2^2 \implies \lambda = \lambda^*
  \end{align*}

  Affine sets\\
  A set $C \subseteq \R^n$ is affine if $(\forall x_1, x_2 \in C)(\forall \theta \in \R)\ \theta x_1 +(1-\theta) x_2 \in C$\\
  
  Convex sets\\
  A set $C \subseteq \R^n$ is convex if $(\forall x_1,x_2 \in C)(\forall \theta \in \R) 0 \leq \theta \leq 1 \implies \theta x_1 +(1-\theta)x_2 \in C$\\

  Operations preserving convex sets:
  \begin{itemize}
  \item partial sum
  \item sum
  \item coordinate projection
  \item scaling
  \item translation
  \item intersection between any convex sets
  \end{itemize}  

  Separating Hyperplanes: if $S,T \subset \R^n$ are convex and disjoint, then $\exists a \neq 0, b$ such that:\\
  \begin{align*}
    a^Tx \geq b, \forall x \in S\\
    a^Tx \leq b, \forall x \in T\\
  \end{align*}

  Supporting Hyperplane:\\
  if $S$ is convex, $\forall x_0 \in \partial S$ (boundary of S), then $\exists a \neq 0$ such that $a^Tx \leq a^Tx_0, \forall x \in S$\\
  
  Convex combination:\\
  $\sum_i \theta_i x_i, \forall \theta_i \in \R, \sum_i \theta_i = 1, \theta_i \geq 0$\\
  
  Convex hull:\\
  The set of all convex combinations of points in $C$, the hull is convex\\

  Hyperplane
  \begin{align*}
    C=\{x: a^Tx=b\}, a \in \R^n, a \neq 0, b \in \R
  \end{align*}
  Halfspaces
  \begin{align*}
    C&=\{x: a^Tx \leq b\}, a \in \R^n, a \neq 0, b \in \R\\
     &\text{let } a^Tx_c=b\\
    C&=\{x: a^T(x-x_c) \leq 0\}, a \in \R^n, a \neq 0
  \end{align*}
  
  Ellipsoid
  \begin{align*}
    E(x_c,P) = \{ x: (x-x_c)^T P^{-1} (x-x_c) \leq 1 \}, P \succ 0\\
    P=r^2 I \implies \text{ Euclidean Ball }\\
    P=Q \begin{bmatrix}
      \lambda_1 & ..\\
      .. & \lambda_n\\
    \end{bmatrix}
    Q^T\\
    (x-x_c)^T (Q \begin{bmatrix}
      \lambda_1 & ..\\
      .. & \lambda_n\\
    \end{bmatrix}
    Q^T)^{-1}(x-x_c) \leq 1\\
    \tilde{x}^T \begin{bmatrix}
      \frac{1}{\lambda_1} & ..\\
      .. & \frac{1}{\lambda_n}\\
    \end{bmatrix}
    \tilde{x} \leq 1\\
    \tilde{x}^T \begin{bmatrix}
      \frac{1}{\lambda_1} & ..\\
      .. & \frac{1}{\lambda_n}\\
    \end{bmatrix}
    \tilde{x} = \frac{\tilde{x_1}^2}{\lambda_1}+..+\frac{\tilde{x_n}^2}{\lambda_n}\leq 1\\
    \text{volume of ellipsoid proportional to } \sqrt{det(P)}=\sqrt{\Pi_i \lambda_i}
  \end{align*}
  
  \vfill\null
  \columnbreak
  \vfill\null
  \columnbreak
  
  \section{Problem Types}
  \subsection*{LP}
  standard, inequality, general forms
  \begin{align*}
    \min_x c^Tx\ s.t.:\\
         Ax = b\\
         x \succeq 0
  \end{align*}

  \begin{align*}
    \min_x c^Tx\ s.t.:\\
    Ax \preceq b
  \end{align*}

  \begin{align*}
    \min_x c^Tx+d\ s.t.:\\
    Gx \preceq h\\
    Ax = b
  \end{align*}
  
  \subsection*{QP}
  \begin{align*}
    \min_x \frac{1}{2} x^T P x + q^T x + r\ s.t.:\\
    Gx \leq h\\
    Ax = b
  \end{align*}
  \subsection*{QCQP}
  \begin{align*}
    \min_x \frac{1}{2}x^TP_0x + q_0^Tx + r_0, s.t.:\\
    \frac{1}{2}x^TP_ix + q_i^Tx + r_i \leq 0, \forall i\\
    Ax=b
  \end{align*}
  \subsection*{SOCP}
  \begin{align*}
    \min_x  f^T x\ s.t.:\\
    \norm{A_ix+b_i}_2 \leq c_i^Tx+d_i, \forall i\\
    Fx = g
  \end{align*}
  \begin{align*}
    (\forall i) A_i=0 \implies LP\\
    (\forall i) c_i=0 \implies QCQP
  \end{align*}
  \vfill\null
  \columnbreak
  \subsection*{GP}
  \begin{align*}
    \min_x  f_0(x)\ s.t.:\\
    f_i(x) \leq 1, \forall i\\
    h_i(x) = 1, \forall i\\
    f_i\ is\ a\ posynomial := \sum_i h_i\\
    h_i\ is\ a\ monomial := cx_1^{a_1}x_2^{a_2}.., c>0, a_i \in \R\\
  \end{align*}
  Use transform of objective and constraint functions:\\
  $y_i=log x_i, x_i=e^{y_i}$\\
  $\tilde{h_i}$ becomes exponential of affine function\\
  $\tilde{f_i} = log(f_i)$ becomes log sum exp (convex)\\
  If all constraints and objective are monomials, reduces to LP after transform.
  \subsection*{SDP}
  general, standard, inequality forms
  \begin{align*}
    \min_x\ c^Tx\ s.t.:\\
    LMI:\ \sum_i^n x_i F_i + G \preceq_K 0\\
    Ax=b\\
    x\in\R^n\\
    F_i,G \in S^m, K = S_+^m
  \end{align*}
  \begin{align*}
    \min_X\ tr(CX) s.t.:\\
    tr(A_iX)=b_i, \forall i\\
    X \succeq 0
  \end{align*}
  \begin{align*}
    \min_x\ c^Tx\ s.t.:\\
    \sum_i^n x_i A_i \preceq_K B\\
    Ax=b\\
    B,A_i \in S^m, K = S_+^m
  \end{align*}
  concatenating constraints:
  \begin{align*}
    F^{(i)}(x) = \sum_j x_j F_j^{(i)} + G^{(i)} \preceq 0\\
    Gx \preceq h\\
    \implies\\
    diag(Gx-h, F^{(1)}(x),..,F^{(m)}(x)) \preceq 0\\
  \end{align*}  
  if all matrices are diagonal, reduces to LP
  \pagebreak

  \section{Convex/Concave Functions}

  \begin{itemize}
  \item Affine
  \item Pointwise supremum of convex functions
    \begin{itemize}
    \item distance to farthest point in arbitrary set
    \item support function of set
    \end{itemize}
  \item Partial minimization:\\
    $g(x,y)\ convex\ in\ x,y, C\ convex \implies\\ min_{y\in C}\ g(x,y)\ convex$
  \item shortest distance to a convex set
  \item Any type of norm
  \item Non-negative weighted sum of convex functions
  \item indicator function $I_C(x)$
  \end{itemize}
  
  \subsection{log det X, concave}
  \begin{align*}
    let\ X=Z+tV \succ 0\\
    f=log det(Z+tV)\\
    f=log det(Z^{0.5}(I+tZ^{-0.5}VZ^{-0.5})Z^{0.5})\\
    f=log (det(Z^{0.5})det(I+tZ^{-0.5}VZ^{-0.5})det(Z^{0.5}))\\
    f=log (det(Z)det(I+tZ^{-0.5}VZ^{-0.5}))\\
    f=log det(Z) + log det(I+tZ^{-0.5}VZ^{-0.5})\\
    f=log det(Z) + log \Pi_i (1+\lambda_i t), \lambda_i = \lambda_i(Z^{-0.5}VZ^{-0.5})\\
    f=log det(Z) + \sum_i log(1+\lambda_i t)\\
    \frac{\partial f}{\partial t} = \sum_i \frac{\lambda_i}{1+\lambda_i t}\\
    \frac{\partial^2 f}{\partial t^2} = \sum_i \frac{-\lambda_i^2}{(1+\lambda_i t)^2} = -\sum_i \frac{\lambda_i^2}{(1+\lambda_i t)^2} \leq 0\\
    \nabla^2 f \leq 0 \iff f\ concave\\
  \end{align*}

  \vfill\null
  \columnbreak
    
  \subsection{log $\sum_i exp(x_i)$, convex}
  \begin{align*}
    \nabla^2 f = \frac{1}{(1^Tz)^2} (1^Tz diag(z) -zz^T)\\
    v^T zz^T v = tr(v^T zz^T v) = tr(vv^T zz^T)\\
    v^T zz^T v = \sum_j \sum_i z_j z_i v_j v_i\\
    v^T zz^T v = (\sum_j z_j v_j)(\sum_i z_i v_i)\\
    v^T zz^T v = (\sum_i z_i v_i)^2\\
    \text{use Cauchy--Schwarz inequality:}\\
    \|a\|_2^2 \|b\|_2^2 \geq |a^Tb|^2\\
    let\ a_i = z_i^{0.5}, b_i=v_iz_i^{0.5}\\
    1^Tz(\sum_i v_i^2 z_i)-(\sum_i z_i v_i)^2 \geq 0\\
    v^T\nabla^2f v = \frac{1}{(1^Tz)^2} \bigg(1^Tz(\sum_i v_i^2 z_i)-(\sum_i z_i v_i)^2\bigg) \geq 0\\
    \nabla^2f \geq 0 \iff f\ convex
  \end{align*}  
  
  \subsection{geometric mean on $R_{++}^n$, concave}
  \begin{align*}
    f=(\Pi_i x_i)^{\frac{1}{n}}\\
    \frac{\partial}{\partial x_i} f = \frac{1}{n}(\Pi_i x_i)^{\frac{1}{n}-1}\Pi_{j\not=i}x_j\\
    \frac{\partial^2}{\partial x_i^2} f = \frac{1}{n}(\frac{1}{n}-1)(\Pi_i x_i)^{\frac{1}{n}-2}(\Pi_{j\not=i}x_j)^2\\
    \frac{\partial^2}{\partial x_i^2} f = \frac{1}{n}(\frac{1}{n}-1)\frac{(\Pi_i x_i)^{\frac{1}{n}}}{x_i^2}\\
    \frac{\partial^2}{\partial x_i x_k} f = \frac{1}{n^2}\frac{(\Pi_i x_i)^{\frac{1}{n}}}{x_i x_k}, i\not=k\\
    \frac{\partial^2}{\partial x_i x_k} f = \frac{1}{n^2}\frac{(\Pi_i x_i)^{\frac{1}{n}}}{x_i x_k} -\delta_{ik} \frac{1}{n}\frac{(\Pi_i x_i)^{\frac{1}{n}}}{x_i^2}
    \\
    v^T \nabla^2 f v = \frac{-(\Pi_i x_i)^{\frac{1}{n}}}{n^2}(n \sum_i \frac{v_i^2}{x_i^2} - (\sum_i \frac{v_i}{x_i})^2)\\\
    \text{apply Cauchy--Schwarz inequality:}\\
    let\ a=\bold{1}, b_i = \frac{v_i}{x_i}\\
    \|\bold{1}\|_2^2 (\sum_i \frac{v_i^2}{x_i^2}) \geq (\sum_i \frac{v_i}{x_i})^2\\
    n \sum_i \frac{v_i^2}{x_i^2} - (\sum_i \frac{v_i}{x_i})^2 \geq 0\\
    v^T \nabla^2 f v \leq 0 \iff f\ concave
  \end{align*}

  \subsection{quadratic over linear, convex}
  \begin{align*}
    f(x,y) = \frac{h(x)}{g(y)}, g(y) \ linear, g(y) \in R_+\\
    \nabla^2 f = vv^T \ is\ PSD \iff f\ convex
  \end{align*}

  \vfill\null
  \columnbreak
  
  \section{Composition of functions}

  Mnemonic derivation from scalar composite function
  \begin{align*}
    f=h(g(x))\\
    f'=g'(x)h'(g(x))\\
    f''=g''(x)h'(g(x)) + (g'(x))^2 h''(g(x))\\
    \\
    \text{h convex \& non-decreasing, g convex } \implies \text{ f convex}\\
    h'' \geq 0, g''(x) \geq 0, h'(g(x)) \geq 0 \implies f'' \geq 0\\
    \\
    \text{h convex \& non-increasing, g concave } \implies \text{ f convex}\\
    h'' \geq 0, g''(x) \leq 0, h'(g(x)) \leq 0 \implies f'' \geq 0\\
    \\
    \text{h concave \& non-decreasing, g concave } \implies \text{ f concave}\\
    h'' \leq 0, g''(x) \leq 0, h'(g(x)) \geq 0 \implies f'' \leq 0\\
    \\
    \text{h concave \& non-increasing, g convex } \implies \text{ f concave}\\
    h'' \leq 0, g''(x) \geq 0, h'(g(x)) \leq 0 \implies f'' \leq 0\\
  \end{align*}

  \vfill\null
  
  \pagebreak
  
  \section{Convexity Preservation of Sets}
  \subsection{Intersection}
  \begin{align*}
    (\forall \alpha \in A) S_{\alpha}\text{ is convex cone} \implies\\
    \cap_{\alpha \in A} S_{\alpha} \text{ is convex cone}
  \end{align*}
  Any closed convex set can be represented by possibly infinitely many half spaces.\\
  
  \subsection{Affine functions}
  let $f(x)=Ax+b, f:\R^n \to \R^m$\\
  then if S is a convex set we have:
  \begin{itemize}
  \item project forward: $f(S) = \{ f(X) : X \in S \}$ is convex
  \item project back: $f^{-1}(S) = \{ X : f(X) \in S \}$ is convex
  \end{itemize}
  Example:
  \begin{align*}
    C = \{ y : y=Ax+b, \norm{x} \leq 1\}
  \end{align*}
  $\norm{x} \leq 1$ is convex, $Ax+b$ is affine $\implies$ C is convex\\
  Example:
  \begin{align*}
    C = \{ x : \norm{Ax+b} \leq 1 \}
  \end{align*}
    $\{y: \norm{y} \leq 1 \}$ is convex $\wedge$ $y$ is an affine function of $x \implies$ C is convex\\

  \vfill\null
  \pagebreak

  \section{Constraint Qualifications}

  \subsection{Slater's Constraint Qual.}
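  There exists a strictly feasible point in the relative interior of the problem domain $D$: $(\exists x \in relint(D))$ satisfying the constraints below (a condition on the feasible set, not on the optimal point $x^*$)\\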
  
  Inequalities $(\forall i)f_i(x)$ convex $\wedge\ f_i(x)<0 \implies$ Slater's constraint satisfied.\\
  
  Inequalities $(\forall i)f_i(x)$ affine $\implies$ feasibility $(\forall i)f_i(x) \leq 0$ is enough (strict inequality $f_i(x) < 0$ is only required for the non-affine $f_i$) $\implies$ Slater's constraint satisfied.\\
  
  Achieving Slater's constraint for a convex problem implies 0 duality gap (strong duality).
  \subsection{KKT}
  Assumes optimality achieved with 0 duality gap: $\nabla L(x^*,\lambda^*,v^*)=0$
  \begin{align*}
    &L(x^*,\lambda^*,v^*) = f_0(x^*) + \sum_i \lambda_i^* f_i(x^*) + \sum_i v_i^* h_i(x^*)\\
    &\nabla L(x^*,\lambda^*,v^*) = \nabla f_0(x^*) + \sum_i \lambda_i^* \nabla f_i(x^*) + \sum_i v_i^* \nabla h_i(x^*)
  \end{align*}
  We have the constraints:
  \begin{align*}
    &f_i(x^*) \leq 0\\
    &\lambda_i^* \geq 0\\
    &h_i(x^*)=0\\
    &\lambda_i^* f_i(x^*)=0\\
    &\nabla L(x^*,\lambda^*,v^*) = \nabla f_0(x^*) + \sum_i \lambda_i^* \nabla f_i(x^*) + \sum_i v_i^* \nabla h_i(x^*) = 0
  \end{align*}

  Primal inequality constraints convex and equality constraints affine and KKT satisfied $\implies$ 0 duality gap with specified points for primal and dual. (sufficient).\\

  If Slater's constraint satisfied then the above is sufficient and necessary:\\
  Primal inequality constraints convex and equality constraints affine and KKT satisfied $\iff$ 0 duality gap with specified points for primal and dual.\\
  
  \vfill\null
  
  \pagebreak
  
  \section{Definitions}

  \subsection{Convex Function}
  \begin{align*}
    f(\theta x +(1-\theta) y) \leq \theta f(x) + (1-\theta) f(y), \forall \theta \in [0,1]\\
  \end{align*}
  For convenience we sometimes define an extended value function:\\
  \begin{align*}
    \tilde{f}(x) = \begin{cases}
      f(x), & x \in dom(f)\\
      \infty, & \text{otherwise}\\
    \end{cases}\\
  \end{align*}
  if $f(x)$ convex, then $\tilde{f}$ is also convex\\

  Sublevel set of a function
  \begin{align*}
    C(\alpha) = \{ x \in dom(f): f(x) \leq \alpha \}
  \end{align*}
  For convex function, all sublevel sets are convex ($\forall \alpha$). Converse is not true.\\
  
  Quasi-convex function: if its sublevel sets are all convex.\\
  
  Epigraph of functions:\\
  $epi(f)=\{(x,t): x \in dom(f), f(x) \leq t\} \subseteq \R^{n+1}, \\f: \R^n \to \R$.\\
  
 $f$ is convex function $\iff epi(f)$ is convex set\\

 \subsection{First order condition}
 Suppose f is differentiable and domain of f is convex. Then:\\
 f convex $\iff \\ (\forall x,x_0 \in dom(f)) f(x) \geq f(x_0) + \nabla f(x_0)^T(x-x_0)$\\

  rough proof:\\
  suppose $f(x)$ is convex but $(\exists x, x_0) f(x) < f(x_0) + \nabla f(x_0)^T(x-x_0)$\\
  then this means the function should bend across the tangent line which violates the convexity\\
  
  proof for converse direction:\\
  suppose that $(\forall x, x_0) f(x) \geq f(x_0) + \nabla f(x_0)^T(x-x_0)$\\
  to show that $f(x)$ is convex lets take $x,y \in dom(f), z= \theta x + (1-\theta)y$\\
  $\theta f(x) + (1-\theta) f(y) \geq f(z) + \nabla f(z)^T(\theta x - \theta z + (1-\theta)y - (1-\theta)z)$\\
  $\theta f(x) + (1-\theta) f(y) \geq f(\theta x +(1-\theta)y)$\\
  $f(x)$ is convex
  \subsection{Second order condition}
  Suppose $f$ is twice differentiable and $dom(f)$ is convex, \\
  then $f(x)$ is convex $\iff \nabla^2 f(x) \geq 0 $ (PSD, eg: wrt. $S_+^n$)\\
  
  proof for scalar case:\\
  suppose that $f(x)$ is convex, then the first-order condition holds\\
  for $x,y \in dom(f): f(x) \geq f(y) + f'(y)(x-y)$\\
  for $y,x \in dom(f): f(y) \geq f(x) + f'(x)(y-x)$\\
  $f'(x)(y-x) \leq f(y)-f(x) \leq f'(y)(y-x)$\\
  $f'(x)(y-x) \leq f'(y)(y-x) \implies 0 \leq (y-x)(f'(y)-f'(x))$\\
  % $\frac{x}{(y-x)^2} \implies 0 \leq \frac{f;(y)-f'(x)}{(y-x)}$
  divide by $(y-x)^2$ and take $y\to x$:\\
  $f''(x) = \lim_{\delta x \to 0} \frac{f'(x+\delta x)-f'(x)}{\delta x} \geq 0$\\

  conversely, suppose that $f''(z) \geq 0, \forall z \in dom(f)$, take $x,y \in dom(f)$ WLOG $x < y$\\
  $\int_x^y f''(z)(y-z) dz \geq 0$\\
  $f''(z) \geq 0, (y-z) \geq 0$\\
  $I_1 = \int_x^y f''(z)y dz =  y(f'(y)-f'(x))$\\
  $I_2 = -\int_x^y zf''(z) dz$\\
  $dv=f''(z) dz \implies v = f'(z)$\\
  $u=z\implies du = dz$\\
  $I_2 = -[z f'(z)]|_x^y + \int_x^y f'(z) dz = -y f'(y) + x f'(x)+f(y)-f(x)$\\
  $I_1+I_2=y f'(y)-y f'(x)-y f'(y) + x f'(x) + f(y)-f(x) \geq 0$\\
  $\implies f(y) \geq f(x) + f'(x)(y-x)$ first order condition: $x<y$\\
  first order condition holds $\implies f(x)$ convex\\

  \vfill\null
  \columnbreak

  \subsection{Inequalities}
  $x \preceq_K y \iff y-x \in K$\\
  
  $x$ is a minimum in $S$ wrt. cone $K$:
  \begin{align*}
    &x \in S: (\forall y \in S)f(y) \succeq_K f(x) \iff f(y)-f(x) \in K\\
    &S \subseteq x+K
  \end{align*}
  $x$ is a minimal in $S$ wrt. cone $K$:
  \begin{align*}
    &x \in S: (\forall y \in S)f(y) \preceq_K f(x) \implies x = y\\
    &x \in S: (\forall y \in S) f(x)-f(y) \in K \implies x = y\\
    &(x-K) \cap S = \set{x}
  \end{align*}
  \subsection{Cone}
  \begin{align*}
    (\forall x \in C, \forall \theta \geq 0) \theta x \in C
  \end{align*}
  \subsection{Convex Cone}
  Eg: $S^n, S^n_+$ are convex cones\\
  convexity check for $S^n_+$:\\
  \begin{align*}
    x_1 \in S^n_+ \implies v^Tx_1v \geq 0\\
    x_2 \in S^n_+ \implies v^Tx_2v \geq 0\\
    x=\theta x_1 +(1-\theta)x_2, \theta \in [0,1]\\
    v^T x v = \theta v^T x_1 v + (1-\theta) v^T x_2 v \geq 0\\
    (\forall v) v^T x v \geq 0 \implies x \in S^n_+\\
  \end{align*}
  cone check for $S^n_+$:\\
  \begin{align*}
    x_1 \in S^n_+ \implies \theta x_1 \in S^n_+, \theta \geq 0\\
    (\forall v) v^T x v \geq 0 \implies v^T(\theta x) v \geq 0 \implies \text{ cone}
  \end{align*}
  
  \subsection{Proper Cone}
  Definition:
  \begin{itemize}
  \item convex
  \item closed(contains all limit points)
  \item solid(non-empty interior)
  \item pointed(contains no line): $x \in K, x \neq 0 \implies -x \not\in K$
  \end{itemize}
  Then the proper cone K defines a generalized inequality ($\leq_K$) in $\R^n$
  \begin{align*}
    x \leq_K y \implies y-x \in K\\
    x <_K y \implies y-x \in int(K)
  \end{align*}
  Example: $K=R^n_+$ (non-negative orthant):
  \begin{align*}
    n = 2\\
    x \leq_{R^2_+} y \implies y-x \in R^2_+
  \end{align*}
  Cone provides partial ordering using difference of 2 objects.\\
  $X \leq_{S^n_+} Y \iff Y-X \in S^n_+ \iff Y-X$ is PSD

  \subsection{Norm Cone}
  \begin{align*}
    K=\set{(x,t) \in \R^{n+1} : \norm{x} \leq t}, x \in \R^n
  \end{align*}
  
  \subsection{Dual norm}
  $\|z\|_* := \sup_x \{z^T x : \|x\|_p \leq 1\}$\\
  
  Dual of L1-norm:\\
  $\|z\|_* := \sup_x \{z^T x : \|x\|_1 \leq 1\}$\\
  max $\sum_i z_i x_i$,\\
  subject to : $\sum_i |x_i| \leq 1$\\
  select $x_i$ corresponding to $z_i$ with maximum absolute value\\
  equivalent to $\|z\|_* = \|z\|_{\infty}$\\

  Dual of L-$\infty$-norm:\\
  $\|z\|_* := \sup_x \{z^T x : \|x\|_{\infty} \leq 1\}$\\
  max $\sum_i z_i x_i$,\\
  subject to : $|x_i| \leq 1, \forall i$\\
  choose $x_i=1$ if $z_i \geq 0$ and $x_i=-1$ if $z_i < 0$\\
  equivalent to $\|z\|_* = \|z\|_1$\\

  Dual norm of Lp-norm: Lq-norm where $1/p + 1/q = 1$\\

  Properties of Dual Cone:
  \begin{itemize}
  \item $K^*$ closed and convex
  \item $K_1 \subseteq K_2 \implies K_2^* \subseteq K_1^*$
  \item $K$ has non-empty interior $\implies K^*$ pointed
  \item $cl(K)$ pointed $\implies K^*$ has non-empty interior
  \item $K^{**} = cl(convhull(K))$, useful for relaxed optimization
  \item $K$ convex and closed $\implies K=K^{**}$
  \end{itemize}
  
  \subsection{Operator norm}
  $\norm{X}_{a,b}=sup\{\norm{Xu}_a : \norm{u}_b \leq 1 \}, X \in \R^{m\times n}$
  
  \subsection{Dual cone}
  \begin{align*}
    K\ is\ a\ cone\\
    K^* = \{y:x^T y \geq 0, \forall x \in K\}
  \end{align*}

  \subsection{Dual norm cone}

  \begin{align*}
    K^* = \set{(u,v): \norm{u}_* \leq v}\\
    where\ K = \set{(x,t): \norm{x} \leq t}
  \end{align*}
  
  % $\norm{u}_* = sup \set{ u^Tx: \norm{x} \leq 1}$\\
    
  % interpreted as norm of $u^T$\\
  % from dual norm definition:\\
  % $z^T x \leq \norm{x} \norm{z}_*$\\
  % can be tightened: given x, $(\exists z) z^T x = \norm{x} \norm{z}_*$\\
  % given z, $(\exists x) z^T x = \norm{x} \norm{z}_*$\\
  
  \subsection{support function of a set}
  \begin{align*}
    S_C(x) = \sup \set{x^T y : y \in C}\\
    dom(S_C) = \set{x: \sup_{y\in C} x^Ty < \infty}
  \end{align*}
  It is a pointwise supremum of linear (hence convex) functions of $x$, so it is convex.
  
  \vfill\null

  \pagebreak
  
  \section{Relaxation}

  projection onto the feasible set\\

  take a larger feasible set and optimize in it instead, resulting optimal value is smaller or equal to the original\\

  equality constraints that are convex but not affine, make them inequalities thus transforming to convex problem
  
  \vfill\null
  
  \pagebreak

  \section{Regularized Approximation}

  Noise sensitivity of different objectives:
  \begin{itemize}
  \item robust least squares / Huber penalty
  \item log barrier
  \item deadzone linear
  \item quadratic
  \end{itemize}

  Least norm problems
  \begin{itemize}
  \item L2 norm objective with equality constraint
  \item sparsity inducing norms (eg: L1)
  \item norm ball constraint
  \item probability distribution
    \begin{itemize}
    \item convex comb. of columns of A to fit b
    \end{itemize}
  \item variable constraints
    \begin{itemize}
    \item box
    \item one-sided bound
    \end{itemize}
  \end{itemize}

  Multicriterion Formulation\\

  Tikhonov Regularization\\
  todo..
  
  \vfill\null
  
  \pagebreak

  \section{Descent Methods}

  \begin{algorithm}[H]
    init $x_0 \in dom f$\;
    \While{stopping criterion is not satisfied}{
      $\Delta x \leftarrow$ Compute Descent Direction\;
      $t \leftarrow$ Compute Descent Step Size\;
      $x_{k+1} \leftarrow x_k + t \Delta x$
    }
    \caption{Descent Overview\label{Descent}}
  \end{algorithm}
  
  \subsection{Search Step Size}
  Given $\Delta x$, step direction\\
  Search step size:\\
  Exact Line Search: $t = \argmin_{s\geq 0} f(x+s \Delta x)$\\

  \begin{algorithm}[H]
    \SetKwInOut{Input}{$\Delta x$}\SetKwInOut{Output}{t}
    \Input{Search direction}
    \Output{Step size}
    $\alpha \in (0,0.5)$\;
    $\beta \in (0,1)$\;
    $t \leftarrow$ 1\;
    \While{$f(x+ t \Delta x) > f(x) + \alpha t \nabla f(x)^T \Delta x$}{
      $t \leftarrow \beta t$\;
    }
    \caption{Backtracking Line Search\label{LS_BT}}
  \end{algorithm}

  \vfill\null
  \columnbreak
  
  \subsection{Search Direction - 1st Order Methods}
  Steepest Descent
  \begin{align*}
    % f(x+\Delta x) \geq f(x) + \nabla f(x)^T \Delta x\\
    &\argmin_{\Delta x} f(x) + \nabla f(x)^T \Delta x_{sd}\\
    &\argmin_{\Delta x} \nabla f(x)^T \Delta x_{sd}
  \end{align*}
  Normalized Steepest Descent
  \begin{align*}
    &\Delta x_{nsd} = \argmin_{v} \set{ \nabla f(x)^T v : \norm{v} \leq 1 }\\
    &\Delta x_{nsd} = \frac{\Delta x_{sd}}{\norm{\nabla f(x)}_*}\\
    &\norm{\nabla f(x)}_* = \sup_y \{\nabla f(x)^T y: \|y\| \leq 1 \}\ (dual norm)\\
    &\nabla f(x)^T \Delta x_{sd} = \nabla f(x)^T \Delta x_{nsd} \norm{\nabla f(x)}_* = -\norm{\nabla f(x)}_*^2
  \end{align*}

  Steepest Descent for L2-Norm
  \begin{align*}
    &\Delta x = -\nabla f(x)\\
    &\Delta x_{nsd} = \frac{-\nabla f(x)}{\norm{\nabla f(x)}_2}\\
  \end{align*}
  
  Steepest Descent for Quadratic Norm
  \begin{align*}
    &\norm{z}_P = (z^TPz)^{\frac{1}{2}} = \norm{P^{\frac{1}{2}}z}_2, P \in S_{++}^n\\
    &\Delta x_{nsd} = -(\nabla f(x)^T P^{-1} \nabla f(x))^{-\frac{1}{2}} P^{-1} \nabla f(x)\\
    &\Delta x_{sd} = -P^{-1} \nabla f(x)
  \end{align*}

  Normalized Steepest Descent for L1-norm
  \begin{align*}
    &\Delta x_{nsd} = \argmin_{v} \set{ \nabla f(x)^T v : \norm{v}_1 \leq 1 }\\
    &\Delta x_{nsd} = - sign (\frac{\partial f(x)}{\partial x_i})\ e_i, i: \norm{\nabla f(x)}_{\infty} = |(\nabla f(x))_i|
  \end{align*}
  \vfill\null  
  \pagebreak
  
  \subsubsection{Search Direction - 2nd Order Methods,\\ Unconstrained}
  Newton's Method
  \begin{align*}
    &\Delta x_{nt} = - \nabla^2 f(x)^{-1} \nabla f(x)\\
    &\nabla^2 f(x)^{-1} \succ 0 \implies\\
    &\nabla f(x)^T \Delta x_{nt} = - \nabla f(x)^T \nabla^2 f(x)^{-1} \nabla f(x) < 0
  \end{align*}
  Affine invariance of Newton's method: auto-scaling of level curves to enable better descent direction\\
  
  2nd order Taylor approximation of function is minimized with $\Delta x_{nt}$, so quadratic model gives good approximation near minimizer.\\

  Newton Decrement: used for convergence/stopping criterion
  
  Norm of Newton Step:
  \begin{align*}
    \lambda(x) = (\Delta x_{nt}^T \nabla^2 f(x) \Delta x_{nt})^{\frac{1}{2}} = \norm{\nabla^2 f(x)^{\frac{1}{2}} \Delta x_{nt}}_2
  \end{align*}
  Bounding difference of lower bound of 2nd order model and $f(x)$:
  \begin{align*}
    f(x) - \inf & \{ f(x) + \nabla f(x)^T v + \frac{1}{2} v^T \nabla^2 f(x) v\\
                &: A(x+v) = b \} = \frac{1}{2} \lambda(x)^2
  \end{align*}
  \begin{align*}
    % &\lambda(x) = (\nabla f(x)^T \nabla^2 f(x) ^{-1} \nabla f(x))^{\frac{1}{2}}\\
    &\frac{1}{2}\lambda(x)^2 \approx f(x) -p^*\\
    &\nabla f(x)^T \Delta x_{nt} = - \nabla f(x)^T \nabla^2 f(x)^{-1} \nabla f(x)\\
    &\nabla f(x)^T \Delta x_{nt} = -\lambda(x)^2 = \frac{d}{dt}f(x+\Delta x_{nt}t)|_{t=0}
  \end{align*}
  
  \begin{algorithm}[H]
    init $x_0 \in dom f$\;
    \Do{$\frac{1}{2}\lambda(x)^2>\epsilon$}{
      $\Delta x_{nt} \leftarrow -\nabla^2 f(x)^{-1} \nabla f(x)$\;
      $\lambda(x)^2 \leftarrow \nabla f(x)^T \nabla^2 f(x)^{-1} \nabla f(x)$\;
      $t \leftarrow$ Compute Step Size (line search)\;
      $x_{k+1} \leftarrow x_k + t \Delta x_{nt}$
    }
    \caption{Newton Method Descent\label{NewtonMethod}}
  \end{algorithm}
  Assumptions of above algorithm: KKT matrix invertible. Descent method is decreasing wrt. $f$.\\
  
  Convergence: split into damped phase and quadratically convergent phase. Fast convergence once step size is 1.

  \vfill\null
  \columnbreak

  BFGS: a Quasi-newton method\\
  Direction:
  \begin{align*}
    p_k = -B_k^{-1} \nabla f_k
  \end{align*}
  Properties for $B$:\\
  $B \succ 0$\\
  $B=B^T$\\
  Satisfaction of Secant equation:
  \begin{align*}
    &B_{k+1} (x_{k+1}-x_k) = \nabla f_{k+1} - \nabla f_k\\
    &B_{k+1} s_k = y_k
  \end{align*}
  Curvature condition:
  \begin{align*}
    &B_{k+1} s_k = y_k\\
    &s_k^T B_{k+1} s_k = s_k^T y_k\\
    &B_{k+1} \succ 0 \implies s_k^T y_k > 0
  \end{align*}
  This needs to be enforced when optimizing a non-convex function\\

  General problem of finding $B$
  \begin{align*}
    &\min_B \norm{B-B_k}\\
    &s.t.\ B=B^T, Bs_k = y_k
  \end{align*}
  
  BFGS Update:
  $B_{k+1}=B_{k}+{\frac {\mathbf {y} _{k}\mathbf {y} _{k}^{\mathrm {T} }}{\mathbf {y} _{k}^{\mathrm {T} }\mathbf {s} _{k}}}-{\frac {B_{k}\mathbf {s} _{k}\mathbf {s} _{k}^{\mathrm {T} }B_{k}^{\mathrm {T} }}{\mathbf {s} _{k}^{\mathrm {T} }B_{k}\mathbf {s} _{k}}}$

  \begin{algorithm}[H]
    init $x_0 \in dom f$\\
    init $B_{0}$ (eg: $I$)\\
    \While{$\norm{\nabla f(x_k)}>\epsilon$}{
    solve for $p_k$ in $B_{k} p_k = -\nabla f(x_k)$\\
    $t \leftarrow \argmin_s f(x_k + s p_k)$(or backtrack search)\\
    $s_k \leftarrow t p_k$\\
    $x_{k+1} \leftarrow x_k + s_k$\\
    $y_k \leftarrow \nabla f(x_{k+1}) - \nabla f(x_k)$\\
    $B_{k+1}=B_{k}+{\frac {\mathbf {y} _{k}\mathbf {y} _{k}^{\mathrm {T} }}{\mathbf {y} _{k}^{\mathrm {T} }\mathbf {s} _{k}}}-{\frac {B_{k}\mathbf {s} _{k}\mathbf {s} _{k}^{\mathrm {T} }B_{k}^{\mathrm {T} }}{\mathbf {s} _{k}^{\mathrm {T} }B_{k}\mathbf {s} _{k}}}$
    }
    \caption{BFGS Descent\label{BFGS}}
  \end{algorithm}

  Inversion of $B$ via Sherman-Morrison:\\
  ${\displaystyle B_{k+1}^{-1}=\left(I-{\frac {\mathbf {s} _{k}\mathbf {y} _{k}^{T}}{\mathbf {y} _{k}^{T}\mathbf {s} _{k}}}\right)B_{k}^{-1}\left(I-{\frac {\mathbf {y} _{k}\mathbf {s} _{k}^{T}}{\mathbf {y} _{k}^{T}\mathbf {s} _{k}}}\right)+{\frac {\mathbf {s} _{k}\mathbf {s} _{k}^{T}}{\mathbf {y} _{k}^{T}\mathbf {s} _{k}}}.}$

  Sherman-Morrison:\\
  $A^{-1}$ exists: $(A+uv^T)^{-1}$ exists $\iff 1+v^TA^{-1}u \not=0$\\
  $(A+uv^T)^{-1}=A^{-1} - \frac{A^{-1}uv^T A^{-1}}{1+v^TA^{-1}u}$\\
  
  \vfill\null
  \pagebreak
  \section{Equality Constrained Optimization}
  Approaches:
  \begin{itemize}
  \item elimination of equality constraint, variable substitution into objective, unconstrained opt.
  \item Newton's Method with equality constraint, and assumed feasible start, unconstrained opt.
  \item Newton's Method with equality constraint,\\ infeasible start,\\ equivalence using primal-dual residual method
  \end{itemize}  
  KKT optimality, with x assumed to be feasible:
  \begin{align*}
    &\Delta x = \argmin_v f(x) + \nabla f(x)^T v + \frac{1}{2} v^T \nabla^2 f(x) v\\
    &s.t.\ A(x+v)=b\\
    &optimality\ condition:\\
    &L(v,w) = f(x) + \nabla f(x)^T v + \frac{1}{2} v^T \nabla^2 f(x) v\\
    &+ w^T(A(x+v)-b)\\
    &\frac{\partial L(v,w)}{\partial v} = \nabla f(x) + \nabla^2 f(x) v + A^T w = 0\\
    &A(x+v) = b\\
    &Ax = b \implies Av = 0\\
    & KKT\ system:\\
    &\begin{bmatrix}
        \nabla^2 f(x) & A^T \\
        A & 0 
      \end{bmatrix}
      \begin{bmatrix}
        v\\ w
      \end{bmatrix}=
            K
      \begin{bmatrix}
        v\\ w
      \end{bmatrix} =
    \begin{bmatrix}
        -\nabla f(x)\\
        0
      \end{bmatrix}\\
    &\begin{bmatrix}
        v\\ w
      \end{bmatrix} = K^{-1}
      \begin{bmatrix}
        -\nabla f(x)\\
        0
      \end{bmatrix}, K^{-1}\ exists\\
    &\Delta x = v
  \end{align*}
  
  Alternatively Solve via elimination
  \begin{align*}
    &\nabla f(x) + \nabla^2 f(x) v + A^T w = 0\\
    &v + \nabla^2 f(x)^{-1} A^T w + \nabla^2 f(x)^{-1} \nabla f(x)=0\\
    &Av + A \nabla^2 f(x)^{-1} A^T w + A \nabla^2 f(x)^{-1} \nabla f(x)=0\\
    &A \nabla^2 f(x)^{-1} A^T w + A \nabla^2 f(x)^{-1} \nabla f(x)=0\\
    &w = - (A \nabla^2 f(x)^{-1} A^T)^{-1} A \nabla^2 f(x)^{-1} \nabla f(x)\\
    &v = \nabla^2 f(x)^{-1} (-\nabla f(x) - A^T w)\\
    &\Delta x = v
  \end{align*}
  $\nabla^2 f(x)$ not invertible, augment KKT system s.t.\\
  $(\exists Q) \nabla^2 f(x) + A^TQA \succ 0$
  \begin{align*}
    &\begin{bmatrix}
      \nabla^2 f(x) + A^TQA & A^T \\
      A & 0 
    \end{bmatrix}
    \begin{bmatrix}
      v\\ w
    \end{bmatrix} =
    \begin{bmatrix}
      -\nabla f(x)\\
      0
    \end{bmatrix}\\
    &Av=0\\
    &A^TQAv = 0 \implies\ solution\ same\ to\ original\ problem
  \end{align*}
  
  \vfill\null
  \columnbreak
  
  \begin{algorithm}[H]
    init $x_0 \in dom f, Ax_0=b$\;
    \Do{$\frac{1}{2}\lambda(x)^2>\epsilon$}{
      $\Delta x_{nt} \leftarrow$ Solve KKT System / Elimination\;
      $\lambda(x)^2 \leftarrow \Delta x_{nt}^T \nabla^2 f(x) \Delta x_{nt}$\;
      $t \leftarrow$ Compute Step Size (eg: backtrack)\;
      $x_{k+1} \leftarrow x_k + t \Delta x_{nt}$
    }
    \caption{Newton Method w/ Equality Constraint\label{NewtonMethodEq}}
  \end{algorithm}

  \subsection{Infeasible Start Newton Method}
  Modify KKT system for residual: $Av=b-Ax$
  \begin{align*}
    &\begin{bmatrix}
      \nabla^2 f(x) & A^T \\
      A & 0 
    \end{bmatrix}
          \begin{bmatrix}
            v\\ w
          \end{bmatrix}=
    K
    \begin{bmatrix}
      v\\ w
    \end{bmatrix} =
    \begin{bmatrix}
      -\nabla f(x)\\
      b-Ax
    \end{bmatrix}
  \end{align*}
  Once step length becomes 1, all following iterates are feasible.\\

  Equivalence with Primal-Dual residual update:
  \begin{align*}
    &\min_x f(x), s.t.\ Ax=b
  \end{align*}
  Dual:
  \begin{align*}
    &L(x,v) = f(x) + v^T(Ax-b)\\
    &s.t.\ Ax-b=0\\
    &optimality:\\
    &\frac{\partial L}{\partial x} = \nabla f(x) + A^Tv = 0
  \end{align*}
  residual of primal and dual:
  \begin{align*}
    r=\begin{bmatrix}
      r_{dual}\\
      r_{pri}
    \end{bmatrix}
    =\begin{bmatrix}
      \nabla f(x) + A^Tv\\
      Ax-b
    \end{bmatrix}
  \end{align*}
  \begin{align*}
    &y =
    \begin{bmatrix}
      x\\ v
    \end{bmatrix}\\
    &\Delta y =
    \begin{bmatrix}
      \Delta x\\ \Delta v
    \end{bmatrix}\\
    &r(y+\Delta y) \approx r(y) + Dr(y) \Delta y\\
    &Dr(y):=Gradient\ of\ r(y)\\
    &Dr(y) =
      \begin{bmatrix}
        \nabla_x r_{dual}^T & \nabla_v r_{dual}^T\\
        \nabla_x r_{pri}^T & \nabla_v r_{pri}^T
      \end{bmatrix}\\
    &Dr(y) =
      \begin{bmatrix}
        \nabla^2 f(x) & A^T\\
        A & 0
      \end{bmatrix}                             
  \end{align*}
  goal: $r(y+\Delta y) \to 0$
  \vfill\null
  \pagebreak
  \begin{align*}
    &r(y) + Dr(y) \Delta y = 0\\
    &\begin{bmatrix}
      \nabla f(x) + A^Tv\\
      Ax-b
    \end{bmatrix} + 
    \begin{bmatrix}
        \nabla^2 f(x) & A^T\\
        A & 0
    \end{bmatrix}
    \begin{bmatrix}
      \Delta x\\ \Delta v
    \end{bmatrix} = 0\\
    &\begin{bmatrix}
        \nabla^2 f(x) & A^T\\
        A & 0
    \end{bmatrix}
    \begin{bmatrix}
      \Delta x\\ \Delta v
    \end{bmatrix} =
    \begin{bmatrix}
      -\nabla f(x) - A^Tv\\
      b-Ax
    \end{bmatrix}
  \end{align*}
  comparison with definition of infeasible start Newton Method:
  \begin{align*}
    &\begin{bmatrix}
      \nabla^2 f(x) & A^T \\
      A & 0 
    \end{bmatrix}
          \begin{bmatrix}
            v\\ w
          \end{bmatrix}=
    \begin{bmatrix}
      -\nabla f(x)\\
      b-Ax
    \end{bmatrix}
  \end{align*}
  Equivalence if for infeasible start, we select:
  \begin{align*}
    &v = \Delta x\\
    &A^T(\Delta v + v) = A^T w\\
    &w = v + \Delta v
  \end{align*}
  Solving using primal-dual formulation, obtain $\Delta v, \Delta x$\\
  Backtrack line search using $\norm{r}$ instead of $f$\\

  \begin{algorithm}[H]
    init $x_0 \in dom f$\\
    $\beta \in (0,1)$\\
    $\alpha \in (0,0.5)$\\
    \Do{$\norm{r(x,v)}>\epsilon \vee Ax \not=b$}{
      $\Delta v, \Delta x \leftarrow$ Solve primal-dual KKT system\\
      Backtrack Step Size Search with $\norm{r}$:\\
      $t \leftarrow 1$\\
      \While{$\norm{r(x+ t \Delta x, v + t \Delta v)}>(1-\alpha t)\norm{r(x,v)}$}{
        $t \leftarrow \beta t$\\
      }
      $x_{k+1} \leftarrow x_k + t \Delta x$\\
      $v_{k+1} \leftarrow v_k + t \Delta v$\\
    }
    \caption{Newton Method w/ Infeasible Start\label{NewtonMethodInfeasibleStart}}
  \end{algorithm}

  \vfill\null
  \pagebreak
  
  \section{Inequality Constrained Optimization}
  Ideas:
  \begin{itemize}
  \item gradient projection: update x with descent direction, then project back onto feasibility set. Assume feasibility set is convex.
    \begin{align*}
      &x \leftarrow descent\ update\\
      &\tilde{s} = \argmin_s \norm{x-s}, s.t.\ s \in X=feasibility\ set\\
      &let\ [x]^+ = \argmin_s \norm{x-s}, s.t.\ s \in X\\
      &x_{k+1} \leftarrow [x_k + t \Delta x]^+, eg: \Delta x = -\nabla f(x)
    \end{align*}
    Issues: projection hard in general, line search becomes harder. Ok for simple projection (eg: box constraints).
  \item Adapt Newton's Method with projection.
    \begin{align*}
      \min_v \nabla f(x)^T v + \frac{1}{2} v^T \nabla^2 f(x) v\\
      s.t.\ x_{k+1} = x_k+v \in X
    \end{align*}
    Issues: Hard to solve.
  \item elimination inequality constraint and augment objective (eg: Interior Point)
  \end{itemize}
  \subsubsection{Interior Point with Inequality \&\\ Equality Constraints}
  \begin{align*}
    &\min_x f_0(x)\\
    &s.t.\ f_i(x) \leq 0, \forall i\in\set{1,..,m}\\
    & Ax = b
  \end{align*}
  Assumptions:
  \begin{itemize}
  \item solution exists
  \item strict feasibility (Slater's cond. hold), so strong duality
  \item objective and inequality functions differentiable and convex
  \end{itemize}

  Barrier Method:
  \begin{align*}
    &\min_x f_0(x) + \sum_{i=1}^m I(f_i(x))\\
    &s.t.\ Ax = b\\
    &I(u) =
      \begin{cases}
        0 &, u \leq 0\\
        +\infty &, o/w
      \end{cases}
  \end{align*}

  \vfill\null
  \columnbreak
  
  $I(u)$ convex, non-decreasing, but non-differentiable.\\
  
  Use log barrier function to approximate $I(u)$
  \begin{align*}
    &\hat{I}(u) = -\frac{1}{t}log(-u)\\
    &\hat{I}(u)|_{t \to +\infty} \to I(u)
  \end{align*}
  Log barrier:
  \begin{align*}
    &\Phi(x) = -\sum_i log(-f_i(x))
  \end{align*}
  \begin{align*}
    &\min_x f_0(x) - \frac{1}{t} \sum_{i=1}^m log(-f_i(x))\\
    &\min_x f_0(x) + \frac{1}{t} \Phi(x)\\
    &\min_x\ t f_0(x) + \Phi(x)\\
    &s.t.\ Ax = b
  \end{align*}
  Approach: fix $t$, optimize problem to obtain $x^*(t)$. Repeat for $t$ increasing in value with previously solved intermediate solution so that $x^*(t)|_{t\to +\infty} = x^*$.\\

  Find a starting strictly feasible point $x$. Set $t_0>0$.
  Inner optimization problem, solve $x^*(t)$(via iterative algo such as Newton):
  \begin{algorithm}[H]

    \Do{$\frac{m}{t} < \epsilon$ not met}{
      $x^*(t) = \min_x\ t f_0(x) + \Phi(x)$, s.t. $Ax = b$\\
      update:\\
      $x=x^*(t)$\\
      $t\leftarrow \mu t, \mu > 1$ (eg: 10)
    }
    \caption{Log Barrier Method\label{LogBarrierMethod}}
  \end{algorithm}

  Central Path:\\
  Successive solving of inner optimization problem with varying $t$ traces out a path, leading closer to the optimal final solution.\\

  Central path lies in the interior of the feasibility region, thus potential optimal solution on the boundary of the feasibility set is never exactly reached.\\

  Inner optimization usually uses 2nd order methods for auto-scaling of level set of optimization objective when $t$ is large.\\

  \vfill\null
  \pagebreak
  Search for Initial Feasible Point (Phase 1):\\
  Formulate problem as:\\
  \begin{align*}
    &\min_{x,s} s\\
    &s.t.\ f_i(x) \leq s, \forall i\\
    &Ax=b
  \end{align*}
  If $s^* < 0$, $x^*$ is strictly feasible. Else infeasibility.\\
  
  Initialization of Phase 1 problem: set $s > \max_i f_i(x)$ so that starting point is interior of feasible set. Then proceed to solve using Interior Point method.\\

  If $s^* < 0$, then original problem is solved using $x^*$ as the strictly feasible starting point.

  Interpretation of Interior Point Method with KKT:\\
  Primal of Original:
  \begin{align*}
    &\min_x f_0(x)\\
    &s.t.\ f_i(x) \leq 0, \forall i\in\set{1,..,m}\\
    & Ax = b
  \end{align*}
  Lagrangian of Primal:\\
  $f_0(x) + \sum_i \lambda_i f_i(x) + w^T(Ax-b)$\\
  KKT optimality conditions:
  \begin{align*}
    &\nabla f_0(x) + \sum_i \lambda_i \nabla f_i(x) + A^T w = 0\\
    &\lambda_i \geq 0\\
    &Ax-b = 0\\
    &\lambda_i f_i(x) = 0\\
    &f_i(x) \leq 0
  \end{align*}

  Primal of Log-Barrier:
  \begin{align*}
    &\min_x f_0(x) - \frac{1}{t} \sum_i log(-f_i(x))\\
    & s.t.\ Ax = b
  \end{align*}
  KKT optimality conditions for Log-Barrier problem:
  \begin{align*}
    &\nabla f_0(x) - \frac{1}{t} \sum_i \frac{\nabla f_i(x)}{f_i(x)} + A^T w = 0\\
    &Ax - b = 0\\
    &f_i(x) < 0\ (domain\ of\ log)\\
    &let\ \lambda_i = -\frac{1}{t f_i(x)}\\
    &\nabla f_0(x) + \sum_i \lambda_i \nabla f_i(x) + A^T w = 0\\
    &Ax - b = 0
  \end{align*}
  \vfill\null
  \columnbreak
  
  \begin{align*}
    t > 0, f_i(x) \leq 0 \implies \lambda_i = -\frac{1}{t f_i(x)} \geq 0
  \end{align*}
  Overall (Modified) KKT conditions for Log-Barrier:
  \begin{align*}
    &\nabla f_0(x) + \sum_i \lambda_i \nabla f_i(x) + A^T w = 0\\
    &f_i(x) \leq  0\\
    &Ax - b = 0\\
    &\lambda_i \geq 0\\
    &\lambda_i f_i(x) = -\frac{1}{t}
  \end{align*}
  This approaches true KKT conditions as $t\to +\infty$.
  \vfill\null
  \pagebreak

  Obtaining a bound on duality gap of the modified problem from original:\\
  Dual of original:
  \begin{align*}
    &g(\lambda) = \min_{Ax=b} f_0(x) + \sum_i \lambda_i f_i(x)\\
    &g(\lambda) \leq f_0(x) + \sum_i \lambda_i f_i(x), Ax=b\\
    &g(\lambda) \leq f_0(x^*) + \sum_i \lambda_i f_i(x^*), Ax^*=b\\
    &(\forall i)f_i(x^*) \leq 0, \lambda_i \geq 0 \implies\\
    &g(\lambda) \leq f_0(x^*) + \sum_i \lambda_i f_i(x^*) \leq f_0(x^*)\\
    &g(\lambda) \leq f_0(x^*), \lambda \geq 0
  \end{align*}

  Log-Barrier Problem:
  \begin{align*}
    &\min_x f_0(x) - \frac{1}{t} \sum_i log(-f_i(x))\\
    & s.t.\ Ax = b    
  \end{align*}
  KKT optimality conditions:
  \begin{align*}
    &\nabla f_0(x) + \sum_i -\frac{1}{t f_i(x)} \nabla f_i(x) + A^T w = 0\\
    &f_i(x) \leq  0\\
    &Ax - b = 0\\
    &from\ dual:\\
    &g(\lambda) \leq f_0(x) + \sum_i \lambda_i f_i(x), Ax=b\\
    &\lambda_i = -\frac{1}{t f_i(x)}, f_i(x) \leq 0 \implies same\ KKT\ conditions\\
    &x^*(t)\ minimizes\ for\ particular\ \lambda_i^*(t) = -\frac{1}{t f_i(x^*(t))}\\
    &g(\lambda^*(t)) = f_0(x^*(t)) + \sum_i \lambda_i^*(t) f_i(x^*(t)), Ax^*(t)=b\\
    &g(\lambda^*(t)) = f_0(x^*(t)) + \sum_i -\frac{1}{t f_i(x^*(t))} f_i(x^*(t))\\
    &g(\lambda^*(t)) = f_0(x^*(t)) + \sum_{i=1}^m -\frac{1}{t}, Ax^*(t)=b\\
    &g(\lambda^*(t)) = f_0(x^*(t)) -\frac{m}{t} \leq f_0(x^*)\\
    &f_0(x^*(t)) \leq f_0(x^*) + \frac{m}{t}\\
    &f_0(x^*) \leq f_0(x^*(t)) \implies gives\ bound\ of\ gap: \frac{m}{t}\\
  \end{align*}
  $x^*(t)$ gives solution that is bounded by $\frac{m}{t}$ away from optimal wrt. objective.\\
  $\frac{m}{t}$ usable as stopping criterion.
  \vfill\null
  \columnbreak

  \subsubsection{Primal-Dual Interior Point Method\\ (Alternative to Barrier Method)}
  Overall (Modified) KKT conditions for Log-Barrier:
  \begin{align*}
    &\nabla f_0(x) + \sum_i \lambda_i \nabla f_i(x) + A^T v = 0\\
    &f_i(x) \leq  0\\
    &Ax - b = 0\\
    &\lambda_i \geq 0\\
    &-\lambda_i f_i(x) -\frac{1}{t} = 0
  \end{align*}
  \begin{align*}
    &y =
    \begin{bmatrix}
      x\\ \lambda\\ v
    \end{bmatrix}\\
    &r(y) =
    \begin{bmatrix}
      \nabla f_0(x) + \sum_i \lambda_i \nabla f_i(x) + A^T v\\
      -\lambda_1 f_1(x)-\frac{1}{t}\\
      ..\\
      -\lambda_m f_m(x)-\frac{1}{t}\\
      Ax-b
    \end{bmatrix}\\
  \end{align*}
  Goal: drive $r(y)$ to 0\\

  1st order approx of $r(y+\Delta y)$:
  \begin{align*}
    &r(y+\Delta y) \approx r(y) + Dr(y)\Delta y = 0\\
    &Dr(y) =
      \begin{bmatrix}
        \nabla^2 f_0(x) + \sum_i \lambda_i \nabla^2 f_i(x) & f_1'(x) & .. & f_m'(x) & A^T\\
        -\lambda_1 f_1'(x) & -f_1(x) & 0 & .. & \\
        .. & & ..& &\\
        -\lambda_m f_m'(x) & & 0 & -f_m(x) & 0\\
        A & 0 & .. & & 
      \end{bmatrix}\\
    & Dr(y)\Delta y = -r(y)
  \end{align*}
  Overall algorithm:\\
  \begin{algorithm}[H]
    init strictly feasible $x,\lambda>0, \eta = -\sum_i \lambda_i f_i(x), \mu=10$\\
    $\eta = -\sum_i \lambda_i f_i(x)$\\
    \Do{$\norm{r_{prim}}>\epsilon \vee \norm{r_{dual}}>\epsilon \vee \eta > \epsilon_2$}{
      $t \leftarrow \frac{\mu m}{\eta}$\\
      Solve for $\Delta y$ in $Dr(y) \Delta y = -r(y)$\\
      Line search for step size, $s$, using norm of residual $\norm{r}$, s.t. $\lambda>0,f(x)<0$\\
      $y\leftarrow y + s \Delta y$\\
      $\eta = -\sum_i \lambda_i f_i(x)$ (proxy for duality gap, the smaller the better)\\
    }
    \caption{IP Primal-Dual Method\label{IPPrimDual}}
  \end{algorithm}
  Note: one loop only, but stopping in middle of algorithm does not guarantee a feasible solution.
  \vfill\null
  \pagebreak

  \section{Ellipsoid Method}

  $f$ is convex, $C^{1}$ only\\
  $(\forall x, x_0) f(x) \geq f(x_0) + \nabla f(x_0)^T(x-x_0)$\\
  $\nabla f(x_0)^T (x-x_0) \geq 0 \implies f(x) \geq f(x_0)$\\
  Idea: use of halfspace to perform elimination of search space\\
  Selection of potential search points: use centroid of feasible polyhedron to maximize elimination space\\
  Volume reduction: $(X_{max} - X_{min})/2^k$\\
  \begin{algorithm}[H]
    \Do{Not Satisfy stopping criterion}{
      update select $x^{(k)}$ using center of $C^{(k)}$ where $x^* \in C^{(k-1)}$\\
      $(\forall i ) C^{(k-1)}=\set{x: \nabla f(x^{(i)})^T (x-x^{(i)}) \leq 0 }$\\
      bisection:\\
      $f'(x^{(k)})(x-x^{(k)})$ eliminates 1 side of halfspace\\
      $C^{(k)}=C^{(k-1)} \cap \set{ x: \nabla f(x^{(k)})^T (x-x^{(k)}) \leq 0}$\\
    }
    \caption{Halfspace Elimination\label{HalfspaceElimi}}
  \end{algorithm}

  Pros:
  \begin{itemize}
  \item centroid selection easy
  \item bisection evaluation easy
  \item exponential rate
  \end{itemize}
  Implementation issues: centroid search, intersection in high dimension\\
  Simplify: ellipsoid instead of polyhedral for localization
  \begin{algorithm}[H]
    \Do{Not Satisfy stopping criterion}{
      set $x^{(k+1)}$ as center of $\varepsilon^{(k)}$\\
      eval $\nabla f(x^{(k+1)})$\\
      find min. volume ellipsoid covering:\\
      $S=\set{x: \nabla f(x^{(k+1)})^T(x-x^{(k+1)}) \leq 0}$\\
      $\varepsilon^{(k+1)} \supseteq \varepsilon^{(k)} \cap S$\\
    }
    \caption{Ellipsoid Method\label{algo:EllipsoidMethod}}
  \end{algorithm}

  Intersection search:\\
  $g=\nabla f(x^{(k+1)})$\\
  $\tilde{g} = \frac{g}{\sqrt{g^TPg}}$\\
  $\varepsilon^{(k)} = \set{x: (x-x^{(k+1)})^T P^{-1} (x-x^{(k+1)}) \leq 1}$\\
  $\varepsilon^{(k+1)} = \set{x: (x-x^+)^T (P^+)^{-1} (x-x^+) \leq 1}$\\
  $x^+ = x^{(k+1)} - \frac{1}{n+1} P \tilde{g}, n=dim\ of\ X$\\
  $P^+ = \frac{n^2}{n^2-1}(P-\frac{2}{n+1} P \tilde{g}\tilde{g}^T P)$\\
  $Vol(\varepsilon^{(k+1)}) \leq e^{-\frac{1}{2n}} Vol(\varepsilon^{(k)})$\\
  \vfill\null
  \columnbreak
  \section{Subgradient Method}
  useful for $f$ convex, but not $C^{1},C^{2}$, and when subgradient of certain form is known\\
  
  Idea: need a $g$ such that:\\
  $(\forall x)f(x) \geq f(x_0)+g^T(x-x_0)$\\
  
  subdifferential:\\
  $\partial f(x)=\{ g \in \R^n: g\ \text{is a subgradient of }f \text{ at } x \}$\\

  properties:\\
  closed and convex for all f (nonconvex as well)\\
  can be empty for nonconvex f\\
  $f$ is at least $C^1\ at\ x \iff \partial f(x) = \{ \nabla f(x) \}$\\
  $f\ convex \implies f(x^*) = min_x f(x) \iff 0 \in \partial f(x^*)$\\
  
  subgradient calculus:
  \begin{align*}
    (\forall a>0) &\partial (af) = a \partial f\\
    \partial(f_1+f_2) &= \partial f_1 + \partial f_2\\
    g(x)&=f(Ax+b) \implies \partial g(x)=A^T \partial f(Ax+b)\\
    f(x)&=max_i f_i(x) \implies\\
        &\partial f(x)=conv\bigg(\bigcup_{i: f_i(x)=f(x)} \partial f_i(x)\bigg)
  \end{align*}

  subgradient descent:\\
  eg: $\sum_k^{\infty} t_k^2 < \infty, \sum_k^{\infty} t_k = \infty, x^{k+1} = x^k - t_k g^{k}$\\
  keep track of the best $x$
  
  \subsection{Application}
  Decomposition of LP into decoupled systems:\\
  
  Original LP:
  \begin{align*}
    &\min_{u,v} c^Tu + d^Tv\\
    s.t. & Au \leq b\\
    & Pv \leq q\\
    &Fu + Gv \leq h
  \end{align*}

  Augment objective w/ constraint, partial Lagrangian
  \begin{align*}
    L(u,v,\lambda) &= c^Tu+d^Tv + \lambda^T(Fu+Gv-h)\\
    g(\lambda) &= \min_{u,v} c^Tu+d^Tv + \lambda^T(Fu+Gv-h)\\
    s.t. & Au \leq b\\
         & Pv \leq q
  \end{align*}

  For fixed $\lambda$, we can solve 2 separate LPs involving only u and v
  \begin{align*}
    \min_{u} & c^Tu + \lambda^T Fu\\
    s.t. & Au \leq b
  \end{align*}
  \begin{align*}
    \min_{v} & d^Tv + \lambda^T Gv\\
    s.t. & Pv \leq q
  \end{align*}      
  $\max.\ g(\lambda)$ s.t. $\lambda \geq 0$ gives optimal $\lambda^*$\\
  
  $g(\lambda)$ concave, but not differentiable\\
  Use subgradient, $s$, s.t. $g(\tilde{\lambda}) \leq g(\lambda) + s^T(\tilde{\lambda} - \lambda), \forall \tilde{\lambda}$\\

  $(u^*,v^*)$ for a fixed $\lambda$:\\
  $s=Fu^*+Gv^*-h$ is a subgradient of $g$ at $\lambda$\\
  
  $g(\tilde{\lambda}) = \min_{u,v} c^Tu + d^Tv + \tilde{\lambda}^T(Fu+Gv-h) \leq c^Tu^* + d^Tv^*+\tilde{\lambda}^T(Fu^*+Gv^*-h)$\\

  For $\forall u,v$ feasible:
  \begin{align*}
    g(\tilde{\lambda}) & = \min_{u,v} c^Tu + d^Tv + \tilde{\lambda}^T(Fu+Gv-h)\\
    s.t. & Au \leq b\\
         & Pv \leq q
  \end{align*}
  \begin{align*}
    g(\tilde{\lambda}) & \leq c^Tu^* + d^Tv^* + \tilde{\lambda}^T(Fu^*+Gv^*-h)\\
                       & +\lambda^T(Fu^*+Gv^*-h)\\
                       & -\lambda^T(Fu^*+Gv^*-h)\\
    g(\lambda) & = c^Tu^*+d^Tv^* + \lambda^T(Fu^*+Gv^*-h)\\
    % (\tilde{\lambda}-\lambda)^T&(Fu^*+Gv^*-h) = \tilde{\lambda}^T(Fu^*+Gv^*-h)\\
    %                    &-\lambda^T(Fu^*+Gv^*-h)\\
    g(\tilde{\lambda}) &\leq g(\lambda) + (\tilde{\lambda}-\lambda)^T(Fu^*+Gv^*-h)\\
                       &= g(\lambda) + s^T(\tilde{\lambda}-\lambda)
  \end{align*}

  $s=Fu^*+Gv^*-h$ is subgradient of $g$ at point $\lambda$

  $\max_{\lambda \geq 0} g(\lambda)$ for expression of subgradient\\

  Update rule for $\lambda$:\\
  $\lambda^{(k+1)}=[\lambda^{(k)} + \alpha_k s^{(k)}]^+$

  Interpretation of subgradient $s=Fu^*+Gv^*-h$\\

  Original problem $Fu+Gv-h \leq 0$\\

  $\lambda$, pricing variable associated w/ constraint
  
  Penalize objective w/ $\lambda^T(Fu+Gv-h)$

  $(\exists u^*, v^*) Fu^*+Gv^* \leq h \implies \lambda$ decreases\\
  $(\exists u^*, v^*) Fu^*+Gv^* \geq h \implies \lambda$ increases\\

  Summary for decoupling LP:
  \begin{align*}
    \min_{u,v} & c^T u + d^T v\\
          s.t. & Au \leq b\\
               & Pv \leq q\\
               & Fu+Gv \leq h
  \end{align*}
  Subproblems:
  \begin{align*}
    \min_{u} & c^T u + \lambda^T Fu\\
          s.t. & Au \leq b
  \end{align*}
  \begin{align*}
    \min_{v} & d^T v + \lambda^T Gv\\
          s.t. & Pv \leq q
  \end{align*}  

  \vfill\null
  \columnbreak
  
  Algorithms:\\
  \begin{algorithm}[H]
    set $\lambda^{(0)}>0$\\
    \Do{not meeting $\norm{f(u,v)-g(\lambda)}<\epsilon$}{
      solve 2 LP problems for fixed $\lambda^{(k)} \to (u^*,v^*)$\\
      calculate $s^{(k)}=Fu^*+Gv^*-h$\\
      update $\lambda^{(k+1)}=[\lambda^{(k)}+\alpha_k s^{(k)}]^+$
    }
    \caption{Decoupling LP with Subgradient\label{algo:LPSubgradient}}
  \end{algorithm}

  Choosing $\alpha_k$, derivation ultimately comes to:\\
  $\sum_{k=1}^{\infty} \alpha_k^2 < \infty$, $\sum_{k=1}^{\infty} \alpha_k = \infty \implies$\\
  $\lambda^{(k)} \to \lambda^*,k\to\infty$\\
  eg: $\alpha_k = \frac{1}{k}$

  Issues with subgradient method:
  \begin{itemize}
  \item can be slow
  \item step size selection difficult
  \end{itemize}

  \vfill\null
 
  \pagebreak

  \section{Practical Optimization Strategies}
  
  Ellipsoid Method:
  \begin{itemize}
  \item slower
  \item easier to code
  \end{itemize}

  Interior Point Method:
  \begin{itemize}
  \item faster
  \item harder to code
  \end{itemize}

  Tradeoff:\\
  program vs. run time\\
  leverage existing software: use problem conversion (eg: LP to SDP)\\
  LP $\iff$ SDP:
  \begin{align*}
    \min_{x} c^Tx\\
    s.t. Ax \leq b
  \end{align*}
  \begin{align*}
    \min_{x} c^Tx\\
    s.t. diag(Ax-b) \preceq 0 \iff\\
    \sum_i x_i F_i \preceq 0
  \end{align*}

  SOCP $\iff$ SDP:
  \begin{align*}
    \min_x f^Tx\\
    s.t. \norm{Ax+b}_2 \leq c^Tx + d
  \end{align*}
  \begin{align*}
    \min_x f^Tx\\
    s.t.
    \begin{bmatrix}
      (c^Tx+d)I & Ax+b\\
      (Ax+b)^T & c^Tx+d
    \end{bmatrix} \succeq 0
  \end{align*}

  use Schur Complement:\\
  \begin{align*}
    X = \begin{bmatrix}
      A & B \\
      B^T & C
    \end{bmatrix}\\
    X \succeq 0 \iff A \succeq 0 \wedge C-B^TA^{-1}B \succeq 0\\
    X \succeq 0 \iff C \succeq 0 \wedge A-BC^{-1}B^T \succeq 0
  \end{align*}

  \begin{align*}
    let\ X=
    \begin{bmatrix}
      (c^Tx+d)I & Ax+b\\
      (Ax+b)^T & c^Tx+d
    \end{bmatrix} \succeq 0 \iff\\
    \begin{cases}
      c^Tx+d \succeq 0 & \\
      c^Tx+d - \frac{(Ax+b)^TI(Ax+b)}{c^Tx+d} \succeq 0 &\\      
    \end{cases}\\
    (c^Tx+d)^2 \geq \norm{Ax+b}_2^2\\
    c^Tx+d \geq \norm{Ax+b}_2\\
  \end{align*}
  
  \vfill\null
  \columnbreak

  \subsection{Coordinate Descent}
  \subsubsection{Unconstrained minimization problem}
  \begin{align*}
    \min_{x_1,x_2} f(x_1,x_2)\\
    assume\ x_1^*, x_2^*\ exist
  \end{align*}
  \begin{algorithm}[H]
    \Do{not until convergence}{
      fix $x_1$, min. over $x_2$\\
      fix $x_2$, min. over $x_1$
    }
    \caption{Coordinate Descent\label{algo:CoordDescent}}
  \end{algorithm}
  Convergence: yes\\
  Sequence of objective value is non-increasing\\
  Optimal value exists and bounded below\\
  CD algo. guaranteed to converge to a (local) minima whether or not objective function is convex\\
  global optimal: $
  \begin{cases}
    \text{No} , & \text{nonconvex obj}\\
    \text{Yes} , & \text{convex, differentiable obj}
  \end{cases}
  $\\

  Assume CD converges to $(x_1^*,x_2^*)$, $\nabla f$ exists.\\
  Piecing together:
  \begin{align*}
    \nabla_{x_1} f|_{x_1^*,x_2^*} = 0\\
    \nabla_{x_2} f|_{x_1^*,x_2^*} = 0\\
    \nabla f =
    \begin{bmatrix}
      \nabla_{x_1} f\\
      \nabla_{x_2} f
    \end{bmatrix}\bigg|_{x_1^*,x_2^*} = 0 \implies optimal
  \end{align*}
  Non-differentiable objective function, iterate may get stuck.
  \subsubsection{Constrained problem}
  CD works if constraints are separable\\
  eg:
  \begin{align*}
    \min_{x_1,x_2} f(x_1,x_2)\\
    s.t. 
    \begin{bmatrix}
      A & 0\\
      0 & F
    \end{bmatrix}
    \begin{bmatrix}
      x_1\\x_2
    \end{bmatrix} \leq
    \begin{bmatrix}
      b\\
      g
    \end{bmatrix}
  \end{align*}

  $x_1,x_2$ decoupled, separable\\
  Solve separate problems, concatenate results\\

  At optimality:\\
  \begin{align*}
    &\nabla_{x_1} f(x_1^*,x_2^*) + A^T \lambda_1 = 0\\
    &\nabla_{x_2} f(x_1^*,x_2^*) + F^T \lambda_2 = 0\\
    &\implies\\
    &\nabla_x f(x_1^*,x_2^*) +
    \begin{bmatrix}
      A^T & 0 \\
      0 & F^T
    \end{bmatrix}
    \begin{bmatrix}
      \lambda_1\\
      \lambda_2
    \end{bmatrix}\\
    & = \nabla_x f(x_1^*,x_2^*) + \tilde{A}^T \lambda = 0
  \end{align*}

  CD works for separable constraints due to:
  \begin{itemize}
  \item convergence guarantee
  \item at convergence, KKT condition of orig. problem satisfied
  \end{itemize}

  \vfill\null
  \columnbreak
  
  \subsection{Sequential Quadratic Programming (SQP)}
  
  $\min_x f_0(x)$, $f_0$ nonconvex, differentiable

  Idea: per iteration, approximate $f_0(x)$ by a convex function (eg: quadratic)\\
  Solves for a local optimum\\

  At point $x$:
  \begin{align*}
    f_0(x+v) \approx f_0(x) + \nabla f_0(x)^T v + \frac{1}{2} v^T \nabla^2 f_0(x) v\\
    \min_v \nabla f_0(x)^T v + \frac{1}{2} v^T \nabla^2 f_0(x) v\\
    \Delta x = v^* \implies x = x + \alpha \Delta x, \alpha\ via\ line\ search
  \end{align*}

  Constrained problem:\\
  Idea: form Lagrangian $L(x,\lambda,v)=f_0(x)+\sum_i \lambda_i f_i(x) + \sum_i v_i h_i(x)$
  \begin{align*}
    \min_x L(x,\lambda,v)\\
    s.t. f_i(x) \leq 0\\
    h_i(x) = 0
  \end{align*}
  
  At $\lambda,v$ optimal dual variables
  \begin{align*}
    \min_v (\nabla_x L)^T v + \frac{1}{2}v^T(\nabla_x^2L)v\\
    s.t. \nabla f_i(x)^Tv + f_i(x) \leq 0\\
         \nabla h_i(x)^Tv + h_i(x) = 0
  \end{align*}

  Updating $\lambda,v$?\\
  Select $\Delta \lambda, \Delta v$ in direction of subgradient\\

  For fixed $\lambda, v$: $x^{(k+1)} \leftarrow x + \alpha \Delta x, \Delta x = v$
  
  Algo:\\
  \begin{algorithm}[H]
    \Do{not until convergence}{
      solve QP to obtain $(\Delta x, \Delta \lambda, \Delta v)$\\
      line search $(x,\lambda,v)=(x,\lambda,v)+\alpha(\Delta x,\Delta \lambda, \Delta v)$\\
      reapproximate by QP
    }
    \caption{Sequential Quadratic Programming\label{algo:SQP}}
  \end{algorithm}
  
  \vfill\null
  
  \pagebreak

  \section{Augmented Lagrangian}

  Given:
  \begin{align*}
    & \min_x f(x)\\
    & s.t.\ Ax = b
  \end{align*}

  Add quadratic term and form the Lagrangian:
  \begin{align*}
    \max_y \min_x L(x,y) = & \max_y \min_x f(x) + y^T(Ax-b)\\
                           & + \frac{\rho}{2}\norm{Ax-b}_2^2
  \end{align*}

  Optimality conditions:
  \begin{align*}
    & Ax^* - b = 0\\
    & \partial f(x^*) + A^Ty^* = 0
  \end{align*}

  step size parameter for dual variable:
  \begin{align*}
    &\nabla_x L(x^{k+1},y^k) = 0\\
    &\partial f(x^{k+1}) + A^T y^k + \rho A^T(Ax^{k+1}-b)=0\\
    &\partial f(x^{k+1}) + A^T( y^k + \rho(Ax^{k+1}-b))=0\\
    &y^{k+1} = y^k + \rho(Ax^{k+1}-b)\\
    &\partial f(x^{k+1}) + A^T y^{k+1}=0
  \end{align*}
  
  Updates:
  \begin{align*}
    x^{k+1} = & \argmin_x f(x) + {y^{k}}^T(Ax-b)\\
                           & + \frac{\rho}{2}\norm{Ax-b}_2^2\\
    y^{k+1} = & y^k + \rho(Ax^{k+1}-b)
  \end{align*}

  \vfill\null
  \columnbreak
  
  \section{ADMM}

  Form augmented Lagrangian, minimize wrt. primal variables, maximize dual problem, use coordinate descent for update. Example, for constrained minimization:
  \begin{align*}
    & \min_{x,z} f(x) + g(z)\\
    & s.t. Ax + Bz = c\\
    & L_p = f(x) + g(z) + y^T(Ax+Bz-c)\\
    & + \frac{\rho}{2} ||Ax+Bz-c||_2^2
  \end{align*}
  
  If multiple rounds of update of primal variable are done before dual variable update, then it is equivalent to method of multipliers where joint optimization is done:
  \begin{align*}
    &x^{k+1}, z^{k+1} = \argmin_{x,z} L_p(x,z,y^k)\ [descent]\\
    &y^{k+1} = \argmax_{y} L_p(x^{k+1},z^{k+1},y)\ [ascent]
  \end{align*}
  ADMM update:
  \begin{align*}
    x^{k+1} = & \argmin_{x} L_p(x,z^k,y^k)\\
    z^{k+1} = & \argmin_{z} L_p(x^{k+1},z,y^k)\\
    y^{k+1} = & \argmax_{y} L_p(x^{k+1},z^{k+1},y)\\
              = & y^k + \rho(Ax^{k+1} + Bz^{k+1} -c)
  \end{align*}

  alternative formulation with scaled form of ADMM, with the same constrained optimization above:
  \begin{align*}
    L_p = & f(x) + g(z) + y^T(Ax+Bz-c)\\
          & + \frac{\rho}{2} ||Ax+Bz-c||_2^2\\
    r= & Ax+Bz-c\\
           u = & \frac{y}{\rho}\\
           L_p = & f(x) + g(z) + y^T r + \frac{\rho}{2} ||r||_2^2\\
    y^T r + \frac{\rho}{2} ||r||_2^2 = & \frac{\rho}{2}( \frac{2}{\rho} y^T r + r^T r)\\
    = & \frac{\rho}{2}(r^T r + \frac{2 y^T r}{\rho} + \frac{ y^T y}{\rho ^2}) - \frac{y^T y}{2\rho}\\
    = & \frac{\rho}{2} ||r+\frac{y}{\rho}||_2^2 - \frac{1}{2 \rho} ||y||_2^2\\
    = & \frac{\rho}{2} ||r+u||_2^2 - \frac{\rho}{2}||u||_2^2\\
    = & \frac{\rho}{2} (||r+u||_2^2 - ||u||_2^2)
  \end{align*}

  \vfill\null
  \columnbreak
  
  \begin{align*}
    L_p = & f(x) + g(z) + \frac{\rho}{2} (||r+u||_2^2 - ||u||_2^2)\\
    x^{k+1} = & \argmin_{x} L_p = \argmin_{x} f(x)\\
          & + \frac{\rho}{2}(||r+u||_2^2-||u||_2^2)\\
    \frac{\partial u}{\partial x} = & 0 \implies\\
    x^{k+1} = & \argmin_{x} f(x) + \frac{\rho}{2}||r+u||_2^2\\
    = & \argmin_{x} f(x) + \frac{\rho}{2}||Ax + Bz^k - c + u^k||_2^2\\
    z^{k+1} = & \argmin_{z} L_p = \argmin_{z} g(z)\\
          & + \frac{\rho}{2}(||r+u||_2^2-||u||_2^2)\\
    \frac{\partial u}{\partial z} = & 0 \implies\\
    z^{k+1} = & \argmin_{z} g(z) + \frac{\rho}{2}||r+u||_2^2\\
          = & \argmin_{z} g(z) + \frac{\rho}{2}||Ax^{k+1} + Bz - c + u^k||_2^2\\
    y^{k+1} = & y^k + \rho (Ax^{k+1} +Bz^{k+1} - c)\\
    \rho u^{k+1} = & \rho u^k + \rho (Ax^{k+1} +Bz^{k+1} - c)\\
    u^{k+1} = & u^k + Ax^{k+1} +Bz^{k+1} - c
  \end{align*}

  thus, ADMM scaled form update becomes:
  \begin{align*}
    x^{k+1} = & \argmin_{x} f(x) + \frac{\rho}{2}||Ax + Bz^k - c + u^k||_2^2\\
    z^{k+1} = & \argmin_{z} g(z) + \frac{\rho}{2}||Ax^{k+1} + Bz - c + u^k||_2^2\\
    u^{k+1} = & u^k + Ax^{k+1} +Bz^{k+1} - c
  \end{align*}

  residual at iteration k:
  \begin{align*}
    &r^k = Ax^k + Bz^k - c
  \end{align*}
  sum of residuals:
  \begin{align*}
    &u^k = u^0 + \sum_{j=1}^{k} r^j\\
    &u^{k+1} = u^k + r^{k+1}
  \end{align*}


  \vfill\null
  \columnbreak
  
  Example for constrained convex set minimization:
  \begin{align*}
    &\min_x f(x)\\
    &s.t.\ x \in C
  \end{align*}

  augmented Lagrangian and scaled form ADMM
  \begin{align*}
    &\min_x f(x) + g(z)\\
    &g(z) = I_C(z)\\
    &s.t.\ x-z=0
  \end{align*}
  \begin{align*}
    r=&x-z\\
    L_p(x,z,y) =& f(x) + g(z) + y^T(x-z) + \frac{\rho}{2} \norm{x-z}_2^2\\
               =& f(x) + g(z) + \frac{\rho}{2}(r^T r + \frac{2 y^T r}{\rho} + \frac{y^T y}{\rho^2})\\
                & - \frac{y^T y}{2 \rho}\\
    =& f(x) + g(z) + \frac{\rho}{2} \norm{r+\frac{y}{\rho}}_2^2 - \frac{\norm{y}_2^2}{2\rho}\\
    u=& \frac{y}{\rho}\\
    L_p(x,z,y) =& f(x) + g(z) + \frac{\rho}{2}(\norm{r+u}_2^2 - \norm{u}_2^2)\\
    x^{k+1} =& \argmin_x f(x) + \frac{\rho}{2} \norm{r^k+u^k}_2^2\\
    x^{k+1} =& \argmin_x f(x) + \frac{\rho}{2} \norm{x-z^k+u^k}_2^2\\
    z^{k+1} =& \argmin_z g(z) + \frac{\rho}{2} \norm{x^{k+1}-z+u^k}_2^2\\
    z^{k+1} =& \argmin_z I_C(z) + \frac{\rho}{2} \norm{x^{k+1}-z+u^k}_2^2\\
    z^{k+1} =& \Pi_C(x^{k+1}+u^k)\ [projection\ to\ C]\\
    \rho u^{k+1} =& \rho u^k + \rho (x^{k+1} - z^{k+1})\\
    u^{k+1} =& u^k + x^{k+1} - z^{k+1}
  \end{align*}

  \vfill\null
  \columnbreak
  
  \subsection{Convergence}
  \begin{align*}
    &\text{1. f and g convex, closed, proper}\\
    &\text{2. unaugmented Lagrangian has a saddle point}\\
    &\implies\\
    &\text{1. residual convergence as }k \rightarrow \infty\\
    &\text{2. objective convergence as }k \rightarrow \infty\\
    &\text{3. dual variable convergence as }k \rightarrow \infty
  \end{align*}
  
\subsection{Optimality and Stopping Conditions}
  use primal and dual residual as proxy for stopping\\
  \begin{align*}
    &f(x^k)+g(z^k)-p^* \leq -{y^k}^T r^k + (x^k-x^*)^T s^k\\
    &\norm{x^k-x^*}_2 \leq d\\
    &f(x^k)+g(z^k)-p^* \leq -{y^k}^T r^k + d \norm{s^k}_2\\
    &f(x^k)+g(z^k)-p^* \leq \norm{y^k}_2 \norm{r^k}_2 + d \norm{s^k}_2
  \end{align*}

  make residuals small:
  \begin{align*}
    &\norm{r^k}_2 \leq \epsilon^{pri}\\
    &\norm{s^k}_2 \leq \epsilon^{dual}
  \end{align*}

  Eg:
  \begin{align*}
    &\epsilon^{pri}=\sqrt{p} \epsilon^{abs} + \epsilon^{rel} max(\norm{Ax^k}_2, \norm{Bz^k}_2, \norm{c}_2)\\
    &\epsilon^{dual}=\sqrt{n} \epsilon^{abs} + \epsilon^{rel} \norm{A^T y^k}_2
  \end{align*}
    
  \vfill\null
  \columnbreak

  \subsection{L1 problems}
  
  \subsubsection{least absolute deviation}
  \begin{align*}
    &\min_x \norm{Ax-b}_1
  \end{align*}
  \begin{align*}
    &z=Ax-b\\
    &\min_{x,z} f(x) + g(z)\\
    &s.t.\ Ax-b-z=0\\
    &g(z) = \norm{z}_1\\
    &f(x) = 0\\
    &L(x,z,y)=\norm{z}_1 + y^T(Ax-b-z)\\
    &+ \frac{\rho}{2}\norm{Ax-b-z}_2^2\\
    &x^{k+1}=\argmin_x L(x,z^k,y^k)\\
    &\frac{\partial L}{\partial x}=A^T y + \rho A^T(Ax-b-z)=0\\
    &x^{k+1}=(A^T A)^{-1}A^T(b+z^k-u^k)\\
    &z^{k+1}=\argmin_z L(x^{k+1},z,y^k)\\
    &0 \in \partial g(z) -y^k - \rho(Ax^{k+1}-b-z)\\
    &0 \in \frac{1}{\rho} \partial g(z) -\frac{y^k}{\rho} - (Ax^{k+1}-b-z)\\
    &0 \in \frac{1}{\rho} \partial g(z) - (Ax+u-b-z)\\
    &z^{k+1} = \argmin_z g(z) + \frac{\rho}{2}\norm{Ax^{k+1}+u^k-b-z}_2^2\\
    &z^{k+1} = prox_{||*||_1, \frac{1}{\rho}}(Ax^{k+1}+u^k-b)\\
    &u^{k+1}=u^k + Ax^{k+1}-b-z^{k+1}
  \end{align*}

  \subsubsection{Huber fitting}
  \begin{align*}
    &\min f_{huber}(Ax-b)\\
    &f_{huber}(v) =
    \begin{cases}
      \frac{v^2}{2} &, |v| \leq 1\\
      |v| - \frac{1}{2} &, |v| > 1
    \end{cases}
  \end{align*}
  use proximal operator for Huber function instead of L1 when updating $z$:
  \begin{align*}
    z^{k+1}=&\frac{\rho}{1+\rho}(Ax^{k+1}-b+u^k)\\
            &+\frac{1}{1+\rho}ST_{1+\frac{1}{\rho}}(Ax^{k+1}-b+u^k)\\
    ST &:= \text{SoftThresholding}
  \end{align*}

  \subsubsection{general L1 regularized loss}
  \begin{align*}
    \min_x l(x) + \lambda \norm{x}_1,\ l\ convex
  \end{align*}

  Lagrangian and ADMM:
  \begin{align*}
    &g(x)=\lambda \norm{x}_1\\
    &\min_{x,z} l(x) + g(z)\\
    &s.t.\ x-z=0\\
    &L(x,z,y) = l(x) + g(z) + y^T (x-z)+\frac{\rho}{2}\norm{x-z}_2^2\\
    &x^{k+1}=\argmin_x L(x, z^k, y^k)\\
    &0 \in \partial l(x) + y + \rho(x-z)\\
    &u=\frac{y}{\rho}\\
    &x^{k+1}=\argmin_x l(x) + \frac{\rho}{2}\norm{x-z^k-u^k}_2^2\\
    &z^{k+1}=\argmin_z L(x^{k+1}, z, y^k)\\
    &0 \in \partial g(z) - y^k + \rho (x^{k+1}-z)(-1)\\
    &z^{k+1}=\argmin_z \lambda \norm{z}_1 + \frac{\rho}{2}\norm{x^{k+1}-z+u}_2^2\\
    &z^{k+1}=\argmin_z \norm{z}_1 + \frac{\rho}{2\lambda}\norm{x^{k+1}+u-z}_2^2\\
    &z^{k+1}=prox_{\norm{*}_1, \frac{\lambda}{\rho}}(x^{k+1}+u^k)\\
    &z^{k+1}=\text{SoftThresholding}_{\frac{\lambda}{\rho}}(x^{k+1}+u^k)\\
    &u^{k+1} = u^k + x^{k+1} - z^{k+1}
  \end{align*}

  Thus, this problem is reduced to solving a series of L2-regularized loss minimization problems.

  If $l(x)$ is smooth, various 1st and 2nd order methods can be used: L-BFGS, Newton, Quasi-Newton, Conjugate Gradient.

  \vfill\null
  \columnbreak
  
  \subsubsection{L1 regularized linear regression (Lasso)}
  \begin{align*}
    &\min_x f(x) + g(x)\\
    &f(x) = \frac{1}{2}\norm{Ax-b}_2^2\\
    &g(x)=\lambda\norm{x}_1
  \end{align*}
  ADMM:
  \begin{align*}
    &\min_{x,z} f(x) + g(z)\\
    &s.t.\ x-z=0\\
    &L(x,z,y)=f(x) + g(z) + y^T(x-z) + \frac{\rho}{2}\norm{x-z}_2^2\\
    &x^{k+1} = \argmin_x L(x,z^k,y^k)\\
    &A^T(Ax-b) + y^k + \rho (x-z^k)=0\\
    &x^{k+1} = (A^TA+\rho I)^{-1}(A^Tb + \rho (z^k-u^k))\\
    &z^{k+1} = \argmin_z L(x^{k+1},z,y^k)\\
    &0 \in \partial g(z) +\rho(x-z+u)(-1)\\
    &z^{k+1} = \argmin_z \lambda\norm{z}_1 + \frac{\rho}{2}\norm{x^{k+1}+u^k-z}_2^2\\
    &z^{k+1} = prox_{\norm{*}_1, \lambda/\rho}(x^{k+1}+u^k)\\
    &z^{k+1} = \text{SoftThresholding}_{\lambda/\rho}(x^{k+1}+u^k)\\
    &u^{k+1} = u^k+x^{k+1}-z^{k+1}
  \end{align*}  

  \vfill\null
  \columnbreak
  
  \subsubsection{generalized lasso}
  \begin{align*}
    &\min_x f(x) + g(x)\\
    &f(x) = \frac{1}{2}\norm{Ax-b}_2^2\\
    &g(x)=\lambda\norm{Fx}_1
  \end{align*}
  
  ADMM:
  \begin{align*}
    &\min_x \frac{1}{2}\norm{Ax-b}_2^2 + \lambda \norm{Fx}_1\\
    &z=Fx\\
    &\min_{x,z} \frac{1}{2}\norm{Ax-b}_2^2 + \lambda \norm{z}_1\\
    &s.t.\ Fx-z=0\\
    &L(x,z,y) = \frac{1}{2}\norm{Ax-b}_2^2 + \lambda \norm{z}_1 + y^T(Fx-z)\\
    &+ \frac{\rho}{2}\norm{Fx-z}_2^2\\
    &\frac{\partial L}{\partial x} = A^T(Ax-b) + F^T y + \rho F^T(Fx-z)=0\\
    &A^T Ax + \rho F^TFx = A^T b - F^T y + \rho F^T z\\
    &x^{k+1} = (A^T A + \rho F^T F)^{-1}(A^T b + \rho F^T (z^k-u^k))\\
    &\frac{\partial L}{\partial z} = \partial (\lambda \norm{z}_1) + \rho(Fx-z+u)(-1)\\
    &z^{k+1} = \argmin_z \lambda \norm{z}_1 + \frac{\rho}{2} \norm{Fx^{k+1}-z+u^k}_2^2\\
    &z^{k+1} = prox_{\norm{*}_1, \lambda/\rho} (Fx^{k+1}+u^k)
  \end{align*}
    
  Special case for total variation denoising:
  \begin{align*}
    &A=I\\
    &F=\text{1st order difference matrix}\\
    &F_{ij}=
      \begin{cases}
        1 &, i+1=j\\
        -1&, i=j\\
        0&, o/w
      \end{cases}
  \end{align*}
  
  TODO: dual feasibility
  
  TODO: computation shortcuts
  
  Reference: Distributed Optimization and Statistical Learning via the Alternating Direction Method of Multipliers [Boyd et al.]
  
  \vfill\null
  
  \pagebreak

  \section{Subgradient Methods}

  todo

  \vfill\null
  
  \pagebreak
  
  \section{Appendix}

  \subsection{Gradient of Log Det}
  $f(X)=log(det X)$\\
  \begin{align*}
    f(X+\delta X) &= logdet(X+\delta X)\\
                  &= logdet(X^{\frac{1}{2}}(I+X^{-\frac{1}{2}} \delta X X^{-\frac{1}{2}})X^{\frac{1}{2}})\\
                  &= logdet(X)+logdet(I+X^{-\frac{1}{2}} \delta X X^{-\frac{1}{2}})\\
                  & \text{let } M = X^{-\frac{1}{2}} \delta X X^{-\frac{1}{2}}\\
                  &= logdet(X)+logdet(I+M)
  \end{align*}
  claim eigenvalues of $I+M$: $1+\lambda_i$
  \begin{align*}
    Mv_i &= \lambda_i v_i\\
    (I+M)v_i &= (1+\lambda_i)v_i\\
    det(I+M)&=\prod_i(1+\lambda_i)\\
    f(X+\delta X) &= logdet(X) + log \prod_i(1+\lambda_i)\\
         &= logdet(X) + \sum_i log(1+\lambda_i)\\
         &\approx logdet(X) + \sum_i \lambda_i \text{ since } \delta X \text{ is small}\\
         &\approx logdet(X) + trace(X^{-\frac{1}{2}} \delta X X^{-\frac{1}{2}})\\
         &\approx logdet(X) + trace(X^{-1} \delta X)\\
    trace(X^{-1} \delta X) &= (X^{-T})^T \delta X\\
    f(X+\delta X) &= f(X) + (\nabla f(X))^T \delta X \implies \nabla f(X) = X^{-1}\\
    logdet(X) &= log(X) \implies f'(X)=\frac{1}{X}
  \end{align*}
    
  \subsection{2nd order approximation of Log Det}
  \begin{align*}
    f(X+\delta X) \approx f(X) + \langle \nabla f(X), \delta X \rangle + \frac{1}{2} \langle \delta X, \nabla^2 f(X) \delta X \rangle
  \end{align*}
  first look at first order approximation: $g(X) = X^{-1}$
  \begin{align*}
    g(X+\delta X) &= (X+\delta X)^{-1} = (X^{\frac{1}{2}}(I+X^{-\frac{1}{2}} \delta X X^{-\frac{1}{2}}) X^{\frac{1}{2}})^{-1}\\
                  &= X^{-\frac{1}{2}}(I+X^{-\frac{1}{2}} \delta X X^{-\frac{1}{2}})^{-1} X^{-\frac{1}{2}}\\
                  &\text{for small A(small eigenvalues): } (I+A)^{-1} \approx I-A\\
                  &= X^{-\frac{1}{2}}(I-X^{-\frac{1}{2}} \delta X X^{-\frac{1}{2}}) X^{-\frac{1}{2}}\\
                  &= X^{-1} - X^{-1} \delta X X^{-1}
  \end{align*}
  \begin{align*}
    logdet(X+\delta X)&=logdet(X)+tr(X^{-1}\delta X) - \frac{1}{2} tr(\delta X X^{-1} \delta X X^{-1})
  \end{align*}
  
  \pagebreak
  
  \section{Miscellaneous Properties}
  $(1+x)^{-1} \approx 1-x$ for small $x$\\
  $\lim_{t \to 0} \frac{f(x+\epsilon t) - f(x)}{t} = \ppartial{f(x)}{x} \epsilon$\\

  \subsection{Pseudo-inverse}
  Overconstrained case:\\
  Cast as L2 norm approximation problem
  \begin{align*}
    &\min_x \norm{Ax-b}_2^2
  \end{align*}
  \begin{align*}
    &(Ax-b)^T(Ax-b)=x^TA^TAx -2x^TA^Tb +b^Tb\\
    &\frac{\partial}{\partial x}(x^TA^TAx -2x^TA^Tb +b^Tb)=2A^T Ax - 2A^Tb\\
    &x = (A^T A)^{-1} A^Tb\\
  \end{align*}
  Underconstrained case:\\
  Cast as a least-norm problem w/ equality constraint
  \begin{align*}
    &\min_x \norm{x}_2^2\\
    &s.t.:\ Ax=b
  \end{align*}
  \begin{align*}
    &\min_x L(x,\lambda,v) = x^Tx +v^T(Ax-b)\\
    &\ppartial{(x^Tx +v^T(Ax-b))}{x} = 2x+A^Tv=0\\
    &x=-\frac{1}{2} A^Tv\\
    &g(\lambda,v) = [x^Tx +v^T(Ax-b)]_{x=-\frac{1}{2} A^Tv}\\
    &g(\lambda,v) = -\frac{1}{4}v^TAA^Tv - v^Tb\\
    &dual:\ \max_{\lambda,v} g(\lambda,v)=-\min_{\lambda,v} (-g(\lambda,v))\\
    &\ppartial{(\frac{1}{4}v^TAA^Tv + v^Tb)}{v}=0\\
    &\frac{1}{2}AA^Tv+b=0\\
    &v=-2(AA^T)^{-1}b\\
    &x=-\frac{1}{2} A^Tv|_{v=-2(AA^T)^{-1}b}= A^T(AA^T)^{-1}b\\
    % &b=A(x+x'), \forall x' \in Nullspace(A)\\
    % &x^*=A^T(AA^T)^{-1}Y \implies Ax^* = AA^T(AA^T)^{-1}Y\\
    % &check:\\
    % &(x-x^*) \perp x^*, \forall x \in x_0 + Nullspace(A)\\
    % &(x-x^*)^Tx^* = (x-x^*)^TA^T(AA^T)^{-1}Y\\
    % &(A(x-x^*))^T(AA^T)^{-1}Y = 0\\
  \end{align*}

  \pagebreak

  \section{Problems}
  \subsection{min. vol. covering ball}
  Find minimal volume norm ball $B$ covering $B_j, \forall j$, where $B_j$ is a norm ball. Consider 2-norm:\\

  Let $c_j, r_j$ be center, radius of norm ball $B_j$\\
  Let $c, r$ be center, radius of $B$
  \begin{align*}
    &\min_{c,r}\ r\\
    &s.t. \norm{|c-c_j|+r_j}_2 \leq r, \forall j\\
    &c^Tc + (-c_j+r_j)^T(-c_j+r_j) + 2c^T(-c_j+r_j) \leq r, \forall j\\
    &c^Tc + (c_j+r_j)^T(c_j+r_j) + 2c^T(c_j+r_j) \leq r, \forall j
  \end{align*}
  QCQP problem\\

  Find minimal volume norm ball $B$ covering $B_j, \forall j$, where $B_j$ is a norm ball. Consider $\infty$-norm:
  \begin{align*}
    &\min_{c,r}\ r\\
    &s.t. \norm{|c-c_j|+r_j}_{\infty} \leq r, \forall j\\
    &c-c_j+r_j \leq r, \forall j\\
    &-c+c_j+r_j \leq r, \forall j
  \end{align*}
  LP problem

  \subsection{polyhedron intersection}

  $D=\set{x\in\R^n: Cx \leq d}$\\
  $G=\set{x\in\R^n: Hx \leq g}$\\
  Solve $D \cap G = \emptyset$? using LP:
  \begin{align*}
    &\min_{a,b,x}\ 0\\
    &s.t. a^Tx=b\\
    &Cx-d \not \leq 0\\
    &Hx-g \not \leq 0\\
    \\
    &\min_{a,b,x,e}\ \sum_i e_i\\
    &s.t. a^Tx = b\\
    &Cx-d+e\leq 0\\
    &Hx-g+e\leq 0\\
    &e\leq 0
  \end{align*}

  $f^*$ feasible and $f^* < 0 \implies D \cap G = \emptyset$\\
  otherwise, $\implies D \cap G \neq \emptyset$\\

  Solve $D \subseteq G $? using LP:\\
  \begin{algorithm}[H]
    $verts$ of D $\leftarrow$ solve intersection of halfspace equations of D\\
    \For{$\forall vert \in D$}{
      $min\ 0$\\
      s.t. $Hv \leq g$\\
      $f^* =
      \begin{cases}
        NaN \implies return D \not\subseteq G\\
        o/w \implies continue
      \end{cases}
      $
    }
    return $D \subseteq G$
    \caption{Polyhedron containment check\label{Descent}}
  \end{algorithm}  

\end{multicols*}

\end {document}

\message{ !name(notebook.tex) !offset(-2234) }