
Logit Standardization in Knowledge Distillation

First Author
Institution1
Institution1 address
firstauthor@i1.org

Second Author
Institution2
First line of institution2 address
secondauthor@i2.org
Given a training sample $\mathbf{x}_n$, the teacher and student produce logit vectors

\[ \mathbf{v}_n = f_T(\mathbf{x}_n), \qquad \mathbf{z}_n = f_S(\mathbf{x}_n). \]

The shift and scale used for standardization are the per-sample mean and standard deviation of each logit vector,

\[ a_T = \overline{\mathbf{v}}_n = \frac{1}{K}\sum_{k=1}^{K}\mathbf{v}_n^{(k)}, \qquad a_S = \overline{\mathbf{z}}_n = \frac{1}{K}\sum_{k=1}^{K}\mathbf{z}_n^{(k)}, \]
\[ b_T = \sigma(\mathbf{v}_n) = \Big[\frac{1}{K}\sum_{k=1}^{K}\big(\mathbf{v}_n^{(k)} - \overline{\mathbf{v}}_n\big)^2\Big]^{1/2}, \qquad b_S = \sigma(\mathbf{z}_n) = \Big[\frac{1}{K}\sum_{k=1}^{K}\big(\mathbf{z}_n^{(k)} - \overline{\mathbf{z}}_n\big)^2\Big]^{1/2}, \]

yielding the standardized soft predictions

\[ q(\mathbf{v}_n) = \mathrm{softmax}\big[(\mathbf{v}_n - a_T)/b_T/\tau\big], \qquad q(\mathbf{z}_n) = \mathrm{softmax}\big[(\mathbf{z}_n - a_S)/b_S/\tau\big], \]

while the cross-entropy term uses the unstandardized student prediction

\[ q'(\mathbf{z}_n) = \mathrm{softmax}(\mathbf{z}_n). \]

The student $f_S$ is then updated to minimize the weighted sum of the cross-entropy term and the distillation term $\lambda_{KD}\tau^{2}\mathcal{L}\big(q(\mathbf{v}_n), q(\mathbf{z}_n)\big)$:

\[ \lambda_{CE}\,\mathcal{L}_{CE}\big(y_n, q'(\mathbf{z}_n)\big) + \lambda_{KD}\,\tau^{2}\,\mathcal{L}\big(q(\mathbf{v}_n), q(\mathbf{z}_n)\big). \]
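As a quick sanity check on the pre-process (an observation added here, not text from the excerpt): writing $\widetilde{\mathbf{z}}_n = (\mathbf{z}_n - a_S)/b_S$ for the standardized student logits, the Z-score guarantees zero mean and unit variance by construction, so the base temperature $\tau$ alone controls the softness of $q(\mathbf{z}_n)$:

\[ \frac{1}{K}\sum_{k=1}^{K}\widetilde{\mathbf{z}}_n^{(k)} = \frac{\overline{\mathbf{z}}_n - a_S}{b_S} = 0, \qquad \frac{1}{K}\sum_{k=1}^{K}\big(\widetilde{\mathbf{z}}_n^{(k)}\big)^{2} = \frac{1}{K}\sum_{k=1}^{K}\frac{\big(\mathbf{z}_n^{(k)} - \overline{\mathbf{z}}_n\big)^{2}}{\sigma(\mathbf{z}_n)^{2}} = 1. \]

The same holds for the teacher logits with $a_T$ and $b_T$.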

Input: Transfer set $\mathcal{D}$ of image-label pairs $\{\mathbf{x}_n, y_n\}_{n=1}^{N}$, number of classes $K$, base temperature $\tau$, teacher $f_T$, student $f_S$, loss $\mathcal{L}$ (e.g., KL divergence $\mathcal{L}_{\mathrm{KL}}$)
Output: Trained student model $f_S$

foreach $(\mathbf{x}_n, y_n)$ in $\mathcal{D}$ do
    1. $\mathbf{v}_n = f_T(\mathbf{x}_n)$,  $\overline{\mathbf{v}}_n = \frac{1}{K}\sum_{k=1}^{K}\mathbf{v}_n^{(k)}$
    2. $\mathbf{z}_n = f_S(\mathbf{x}_n)$,  $\overline{\mathbf{z}}_n = \frac{1}{K}\sum_{k=1}^{K}\mathbf{z}_n^{(k)}$
    3. $\sigma(\mathbf{v}_n) = \big[\frac{1}{K}\sum_{k=1}^{K}\big(\mathbf{v}_n^{(k)} - \overline{\mathbf{v}}_n\big)^{2}\big]^{1/2}$
    4. $\sigma(\mathbf{z}_n) = \big[\frac{1}{K}\sum_{k=1}^{K}\big(\mathbf{z}_n^{(k)} - \overline{\mathbf{z}}_n\big)^{2}\big]^{1/2}$
    5. $q(\mathbf{v}_n) = \mathrm{softmax}\big[(\mathbf{v}_n - \overline{\mathbf{v}}_n)/\sigma(\mathbf{v}_n)/\tau\big]$
    6. $q(\mathbf{z}_n) = \mathrm{softmax}\big[(\mathbf{z}_n - \overline{\mathbf{z}}_n)/\sigma(\mathbf{z}_n)/\tau\big]$
    7. $q'(\mathbf{z}_n) = \mathrm{softmax}(\mathbf{z}_n)$
    8. Update $f_S$ towards minimizing $\lambda_{CE}\mathcal{L}_{CE}\big(y_n, q'(\mathbf{z}_n)\big) + \lambda_{KD}\tau^{2}\mathcal{L}\big(q(\mathbf{v}_n), q(\mathbf{z}_n)\big)$
end foreach

Algorithm 1: $\mathcal{Z}$-score logit standardization pre-process in knowledge distillation.
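For concreteness, the following is a minimal PyTorch sketch of the loss computation in Algorithm 1, using KL divergence as $\mathcal{L}$. It is an illustration under stated assumptions, not the paper's released implementation: the function names (`standardize`, `kd_step_loss`), the `eps` guard against zero variance, and the default hyper-parameter values are all choices made here.

```python
import torch
import torch.nn.functional as F

def standardize(logits: torch.Tensor, eps: float = 1e-7) -> torch.Tensor:
    """Z-score over the class dimension: (logits - mean) / std, per sample."""
    mean = logits.mean(dim=-1, keepdim=True)
    # unbiased=False matches the 1/K normalization in Algorithm 1
    std = logits.std(dim=-1, keepdim=True, unbiased=False)
    return (logits - mean) / (std + eps)  # eps is a numerical guard added here

def kd_step_loss(v: torch.Tensor, z: torch.Tensor, y: torch.Tensor,
                 tau: float = 2.0, lambda_ce: float = 1.0,
                 lambda_kd: float = 1.0) -> torch.Tensor:
    """One step's objective: lambda_CE * CE(y, q'(z)) + lambda_KD * tau^2 * KL(q(v) || q(z)).

    v: teacher logits (N, K), assumed computed under torch.no_grad();
    z: student logits (N, K); y: integer labels (N,).
    The hyper-parameter defaults are placeholders, not the paper's settings.
    """
    # q(v_n), q(z_n): softmax of standardized logits at base temperature tau
    q_v = F.softmax(standardize(v) / tau, dim=-1)
    log_q_z = F.log_softmax(standardize(z) / tau, dim=-1)
    # q'(z_n) = softmax(z_n): the cross-entropy term uses the raw student logits
    ce = F.cross_entropy(z, y)
    # F.kl_div expects log-probabilities as input and probabilities as target
    kl = F.kl_div(log_q_z, q_v, reduction="batchmean")
    return lambda_ce * ce + lambda_kd * tau ** 2 * kl
```

In a training loop one would compute the teacher logits under `torch.no_grad()` (so only the student receives gradients), evaluate `kd_step_loss(v, z, y)`, and backpropagate through the student, matching step 8 of Algorithm 1.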