The CZAR (Composite Zero-Agnostic Return) loss function is designed to address limitations observed in previous loss formulations used for inference synthesis in returns prediction topics. This post documents the motivation, functional form, and recommended parameter settings for CZAR loss. This function will replace ZPTAE in returns prediction topics.
Below is the full loss function, including its gradient and Hessian, for use in model training.
def derivative(x):
    """First derivative of the arctan transition: d/dx arctan(x) = 1 / (1 + x**2)."""
    return 1.0 / (1.0 + x**2)
def antiderivative(x):
    """Antiderivative of ``derivative``: arctan(x)."""
    return np.arctan(x)
def double_derivative(x):
    """Symmetrised (absolute-value) second derivative: 2|x| / (1 + x**2)**2."""
    return 2.0 * np.abs(x) / (1.0 + x**2)**2
def eps_effective(eps, delta):
    """Rescale epsilon so that 1 - loss(z_true, 0) / loss(0, epsilon) crosses
    zero at epsilon.

    For delta == 0 this reduces to arctan(eps); otherwise the positive root of
    the quadratic beta * x**2 + x - area = 0 is returned.
    """
    if abs(delta) == 0:
        return np.arctan(eps)
    scale = 1 + delta**2
    area = scale * (np.arctan(eps + delta) - np.arctan(delta))
    # beta is the coefficient on eps_eff**2 in loss(0, eps_eff, 1)
    beta = delta / scale
    # Positive root of beta * x**2 + x - area = 0
    return (np.sqrt(1 + 4 * beta * area) - 1) / (2 * beta)
def softplus(x):
    """Numerically stable softplus, log(1 + exp(x)), safe for large |x|."""
    return np.log1p(np.exp(-np.abs(x))) + np.maximum(x, 0.0)
def norm_smooth(z_true, eps, delta, tau):
    """Normalisation factor for the softening term.

    Computes norm_min, the minimum value of the normalisation at z_true
    (simplified from 1 - loss(z_true, 0) / loss(0, epsilon)), set by the limit
    that loss(z_true, 0) does not decrease as z_true increases. With tau <= 0
    this is a hard hinge max(norm_min, 0); otherwise the hinge is smoothed
    with a softplus.
    """
    def _stable_softplus(x):
        # log(1 + exp(x)) without overflow for large |x|
        return np.maximum(x, 0.0) + np.log1p(np.exp(-np.abs(x)))

    abs_true = np.abs(z_true)
    scale = delta**2 + 1
    numer = scale * (np.arctan(abs_true + delta) - np.arctan(delta))
    denom = eps + delta / scale * eps**2
    norm_min = 1.0 - numer / denom
    if tau <= 0:
        # Hard hinge at zero
        return np.maximum(norm_min, 0.0)
    # Smooth the transition when norm_min drops below zero. tau is scaled by
    # |norm_inf| (the |z_true| -> inf asymptote of norm_min) so the asymptote
    # is invariant across eps and delta.
    numer_inf = scale * (0.5 * np.pi - np.arctan(delta))
    norm_inf = 1.0 - numer_inf / denom
    tau_eff = np.abs(tau) * np.abs(norm_inf)
    return _stable_softplus(norm_min / tau_eff) / _stable_softplus(1 / tau_eff)
def czar_loss(y_true, y_pred, std, mean=0, alpha=1, epsilon=1, tau=0.05):
    """
    Composite Zero-Agnostic Return Loss.

    Asymmetric, piecewise function that is
    * Linear (alpha=0) or quadratic (alpha>0) when y_pred has opposite sign to y_true
    * Linear (alpha=0) or quadratic (alpha>0) when |y_pred| > |y_true|, with a
      decreasing gradient as |z_true| increases
    * Arctangent transition for 0 < |y_pred| < |y_true|

    Args:
        y_true: True returns
        y_pred: Predicted returns
        std: Standard deviation of true returns
        mean: Mean of true returns
        alpha: MSE term constant (alpha=0 is linear only, alpha=1 is maximum gradient)
        epsilon: Loss softening scale, in units of standard deviation. Optimum is eps~1
        tau: Scaling for softening hinge function

    Returns:
        Value of loss

    Raises:
        ValueError: If alpha is outside [0, 1].
    """
    if alpha < 0 or alpha > 1:
        raise ValueError(f'alpha must be between 0 and 1, got {alpha}')
    # Standardise the observation and prediction
    z_true = (y_true - mean) / std
    z_pred = (y_pred - mean) / std
    # Signs, with z == 0 treated as positive so every input falls in a region
    s = np.where(z_true == 0, 1, np.sign(z_true))
    s_pred = np.where(z_pred == 0, 1, np.sign(z_pred))
    a = np.abs(z_true)
    # Prediction projected onto the sign of the observation; selects the region below
    u = s * z_pred
    # Apply horizontal shift to function for smooth change in gradient
    # Alpha should be between 0 and 1. 1/sqrt(3) shifts to the peak of the hessian function
    delta = alpha / np.sqrt(3)
    d2p1 = delta**2 + 1
    d_true = z_true + s * delta
    d_pred = z_pred + s_pred * delta
    h1 = d2p1 * double_derivative(delta)   # curvature at the shift point
    h3 = d2p1 * double_derivative(d_true)  # curvature at the observation
    # Region 1: opposite sign (u <= 0): grad = -s + MSE term
    # Constant so that the middle branch hits zero at z_pred = z_obs
    C = s * d2p1 * (antiderivative(d_true) - antiderivative(s * delta))
    L1 = 0.5 * h1 * z_pred**2 - s * z_pred + C
    # Region 2: same sign, before threshold (0 < u <= a): grad = -s * d2p1 * derivative(d_pred)
    # antiderivative(d_true) term so that the middle branch hits zero at z_pred = z_obs
    L2 = s * d2p1 * (antiderivative(d_true) - antiderivative(d_pred))
    # Region 3: past threshold (u > a): grad = s * d2p1 * derivative(d_true) + MSE term
    dz = z_pred - z_true
    L3 = 0.5 * np.minimum(h3, h1) * dz**2 + s * d2p1 * derivative(d_true) * dz
    # Softening term: normalisation-scaled value of the unsoftened loss at
    # (z_true=0, z_pred=eps_eff). Depends on z_true only, not on y_pred, so it
    # does not contribute to the gradient or hessian.
    if epsilon > 0:
        eps_eff = eps_effective(epsilon, delta)
        # Recursive call with epsilon=0 evaluates the unsoftened loss once
        softening_0 = czar_loss(0, eps_eff, 1., epsilon=0, alpha=alpha)
        norm = norm_smooth(z_true, eps_eff, delta, tau)
        Lsoft = norm * softening_0
    else:
        Lsoft = 0
    return np.where(u <= 0, L1, np.where(u <= a, L2, L3)) + Lsoft
def czar_gradient(y_true, y_pred, std, mean=0, alpha=1):
    """Gradient of the CZAR loss with respect to y_pred.

    The softening term of the loss does not depend on y_pred, so only the
    three piecewise regions contribute. Pseudo-gradients (with constant terms
    dropped) are used in regions 1 and 3 for numerical stability.
    """
    z_true = (y_true - mean) / std
    z_pred = (y_pred - mean) / std
    sign_true = np.where(z_true == 0, 1, np.sign(z_true))
    sign_pred = np.where(z_pred == 0, 1, np.sign(z_pred))
    abs_true = np.abs(z_true)
    aligned = sign_true * z_pred
    # Horizontal shift for a smooth change in gradient; alpha in [0, 1] and
    # 1/sqrt(3) places the shift at the peak of the hessian function
    delta = alpha / np.sqrt(3)
    scale = delta**2 + 1
    d_true = z_true + sign_true * delta
    d_pred = z_pred + sign_pred * delta
    # Curvatures at the shift point and at the observation
    curv0 = scale * (2.0 * np.abs(delta) / (1.0 + delta**2)**2)
    curv_obs = scale * (2.0 * np.abs(d_true) / (1.0 + d_true**2)**2)
    # Region 1: opposite sign (aligned <= 0).
    # Actual gradient is curv0 * z_pred - sign_true; the pseudo gradient uses
    # np.sign(z_true) (zero at z_true == 0) for numerical stability.
    grad1 = curv0 * z_pred - np.sign(z_true)
    # Region 2: same sign, before threshold (0 < aligned <= abs_true)
    grad2 = -sign_true * scale / (1.0 + d_pred**2)
    # Region 3: past threshold (aligned > abs_true).
    # Actual gradient adds sign_true * scale / (1 + d_true**2); that constant
    # is dropped in the pseudo gradient for numerical stability.
    grad3 = np.minimum(curv_obs, curv0) * (z_pred - z_true)
    branch = np.where(aligned <= abs_true, grad2, grad3)
    return np.where(aligned <= 0, grad1, branch) / std
def czar_hessian(y_true, y_pred, std, mean=0, alpha=1):
    """Hessian (second derivative w.r.t. y_pred) of the CZAR loss.

    Pseudo-hessians are used in regions 2 and 3 for numerical stability: the
    curvature is multiplied by (1 + d**2) instead of the constant (delta**2 + 1).
    """
    z_true = (y_true - mean) / std
    z_pred = (y_pred - mean) / std
    sign_true = np.where(z_true == 0, 1.0, np.sign(z_true))
    sign_pred = np.where(z_pred == 0, 1.0, np.sign(z_pred))
    abs_true = np.abs(z_true)
    aligned = sign_true * z_pred
    # Alpha in [0, 1]; 1/sqrt(3) shifts to the peak of the hessian function
    delta = alpha / np.sqrt(3)
    scale = delta**2 + 1
    d_true = sign_true * (np.abs(z_true) + delta)
    d_pred = sign_pred * (np.abs(z_pred) + delta)
    # Region 1: opposite sign (aligned <= 0): constant curvature at the shift point
    curv0 = scale * (2.0 * np.abs(delta) / (1.0 + delta**2)**2)
    hess1 = np.full_like(d_pred, curv0)
    # Region 2: same sign, before threshold (0 < aligned <= abs_true).
    # Actual hessian would use scale; (1 + d_pred**2) is the pseudo-hessian.
    hess2 = (1.0 + d_pred**2) * (2.0 * np.abs(d_pred) / (1.0 + d_pred**2)**2)
    # Region 3: past threshold (aligned > abs_true), consistent with the
    # region-2 pseudo-hessian and capped at the region-1 curvature
    curv_obs = (1.0 + d_true**2) * (2.0 * np.abs(d_true) / (1.0 + d_true**2)**2)
    hess3 = np.full_like(d_pred, np.minimum(curv0, curv_obs))
    branch = np.where(aligned <= abs_true, hess2, hess3)
    return np.where(aligned <= 0, hess1, branch) / std**2














