// ==========================================
// Core fix: declare this file as an ACES 2.0 non-parametric IDT transform.
// NOTE(review): the meaning of each DEFINE_ACES_PARAM field is defined by the
// host application, not by this file -- confirm against its DCTL documentation.
// ==========================================
DEFINE_ACES_PARAM(IS_PARAMETRIC_ACES_TRANSFORM: 0, IS_IDT: 1, ACES_VERSION: 2)

// 3x3 float matrix stored as three float3 rows: r0 is the first row,
// r1 the second and r2 the third.
typedef struct
{
    float3 r0;
    float3 r1;
    float3 r2;
} mat3;

// Forward declarations for the helpers defined later in this file.

// Builds a mat3 whose rows r0, r1, r2 are A, B and C.
__DEVICE__ mat3 make_mat3(float3 A, float3 B, float3 C);
// Matrix-vector product: each output component is a row of A dotted with In.
__DEVICE__ float3 mult_f3_f33 (float3 In, mat3 A);
// Matrix-matrix product A * B.
__DEVICE__ mat3 mult_f33_f33(mat3 A, mat3 B);
// Decodes a single log-encoded value to linear.
__DEVICE__ float dlog2_to_lin(float in);
// Applies dlog2_to_lin to the x, y and z components independently.
__DEVICE__ float3 dlog2_to_lin3(float3 in);

// Packs three float3 values into a mat3.
//   A -> row r0, B -> row r1, C -> row r2.
// Returns the assembled matrix by value.
__DEVICE__ mat3 make_mat3(float3 A, float3 B, float3 C)
{
    mat3 result;

    result.r0 = A;
    result.r1 = B;
    result.r2 = C;

    return result;
}

// Multiplies the 3x3 matrix A by the vector In, treating In as a column:
// output component i is the dot product of row i of A with In.
//   In - input vector (x, y, z).
//   A  - matrix whose rows are r0, r1, r2.
// Returns the transformed vector.
__DEVICE__ float3 mult_f3_f33 (float3 In, mat3 A)
{
    // Every sum is seeded with 0.0f and grouped left to right so the
    // floating-point evaluation order is fixed: 0 + x-term, + y-term, + z-term.
    const float x = ((0.0f + In.x * A.r0.x) + In.y * A.r0.y) + In.z * A.r0.z;
    const float y = ((0.0f + In.x * A.r1.x) + In.y * A.r1.y) + In.z * A.r1.z;
    const float z = ((0.0f + In.x * A.r2.x) + In.y * A.r2.y) + In.z * A.r2.z;

    return make_float3(x, y, z);
}

// Computes the matrix product A * B for two row-stored 3x3 matrices.
// Element (i, j) of the result is row i of A dotted with column j of B.
//   A - left-hand matrix.
//   B - right-hand matrix.
// Returns the product as a new mat3.
__DEVICE__ mat3 mult_f33_f33(mat3 A, mat3 B)
{
    const float3 lhs_rows[3] = {A.r0, A.r1, A.r2};
    float3 product[3];

    for (int row = 0; row < 3; ++row)
    {
        const float3 l = lhs_rows[row];

        // Each entry accumulates from 0.0f over k = 0, 1, 2 in that order;
        // column j of B is (B.r0.j, B.r1.j, B.r2.j).
        product[row].x = ((0.0f + l.x * B.r0.x) + l.y * B.r1.x) + l.z * B.r2.x;
        product[row].y = ((0.0f + l.x * B.r0.y) + l.y * B.r1.y) + l.z * B.r2.y;
        product[row].z = ((0.0f + l.x * B.r0.z) + l.y * B.r1.z) + l.z * B.r2.z;
    }

    return make_mat3(product[0], product[1], product[2]);
}

// Colour-space matrices, written row by row for use with mult_f3_f33 /
// mult_f33_f33. Each macro expands to a make_mat3(...) call, so the matrix is
// rebuilt at every point of use.
// NOTE(review): the colour spaces below are inferred from the macro names
// (source camera gamut -> XYZ, CAT02 D65 -> D60 adaptation, XYZ -> ACES AP0);
// confirm the coefficients against the relevant specifications.
#define dgamut2_to_xyz make_mat3(make_float3(0.6917f, 0.1596f, 0.0990f), make_float3(0.2498f, 0.8381f, -0.0880f), make_float3(0.0f, 0.0f, 1.0891f))
#define d65_to_d60_cat02 make_mat3(make_float3(1.0120f, 0.0080f, -0.0158f), make_float3(0.0058f, 1.0014f, -0.0063f), make_float3(-0.0003f, -0.0010f, 0.9276f))
#define xyz_to_acesAP0 make_mat3(make_float3(1.0498f, 0.0f, -0.0001f), make_float3(-0.4959f, 1.3733f, 0.0982f), make_float3(0.0f, 0.0f, 0.9913f))

// AP0 -> AP1 (ACEScg) matrix, used so the output lands in AP1.
// NOTE(review): the original comment described this as a correction matrix
// introduced by ACES 2.0 -- that attribution is not verifiable from this file.
#define acesAP0_to_acesAP1 make_mat3(make_float3(1.4514393161f, -0.2365107469f, -0.2149285693f), make_float3(-0.0765537734f, 1.1762296998f, -0.0996759264f), make_float3(0.0083161484f, -0.0060324498f, 0.9977163014f))

// Recombined matrix pipeline:
//   dgamut2_to_AP0 = xyz_to_acesAP0 * d65_to_d60_cat02 * dgamut2_to_xyz
//   dgamut2_to_AP1 = acesAP0_to_acesAP1 * dgamut2_to_AP0
// These expand to nested mult_f33_f33 calls evaluated wherever the macro is used.
#define dgamut2_to_AP0 mult_f33_f33(xyz_to_acesAP0, mult_f33_f33(d65_to_d60_cat02, dgamut2_to_xyz))
#define dgamut2_to_AP1 mult_f33_f33(acesAP0_to_acesAP1, dgamut2_to_AP0)

// Decodes one log-encoded channel value to linear.
// NOTE(review): presumably the camera log curve implied by the "dlog2" name --
// confirm the constants against the vendor's published specification.
//
// The curve is piecewise, selected by the encoded value `in`:
//   in >= 0.304985...            exponential:  H / (2^a - 1) * (2^(a * in) - 1)
//   0.148314... <= in < 0.3049.. power of two: 2^((in - b1) / k1 + log2(0.18))
//   otherwise                    linear:       (in - b2) / k2 + 0.028961...
// A NaN input fails both comparisons and so goes through the linear segment.
//
//   in - encoded value.
// Returns the decoded linear value.
__DEVICE__ float dlog2_to_lin(float in)
{
    // Curve coefficients.
    const float H = 475.0f;
    const float a = 16.285770761945304f;
    const float k1 = 0.059439938321493f;
    const float b1 = 0.304985337243402f;
    const float k2 = 2.960935245492250f;
    const float b2 = 0.148314799066323f;

    // Encoded-domain breakpoints between the three segments.
    const float enc_break_hi = 0.304985337243402f;
    const float enc_break_lo = 0.148314799066323f;

    // Linear values the middle and bottom segments produce at in == b1 and in == b2.
    const float lin_at_b1 = 0.18f;
    const float lin_at_b2 = 0.028961695254132f;

    if (in >= enc_break_hi)
    {
        const float scale = H / (_exp2f(a) - 1.0f);
        return scale * (_exp2f(a * in) - 1.0f);
    }

    if (in >= enc_break_lo)
    {
        return _exp2f((in - b1) / k1 + _log2f(lin_at_b1));
    }

    return (in - b2) / k2 + lin_at_b2;
}

// Applies dlog2_to_lin to each of the three components of `in` independently.
//   in - encoded triplet (x, y, z).
// Returns the decoded triplet.
__DEVICE__ float3 dlog2_to_lin3(float3 in)
{
    return make_float3(dlog2_to_lin(in.x),
                       dlog2_to_lin(in.y),
                       dlog2_to_lin(in.z));
}

// Per-pixel entry point: decodes the log-encoded input to linear, then applies
// the dgamut2_to_AP1 matrix.
//   p_Width, p_Height - image dimensions (not used here).
//   p_X, p_Y          - pixel coordinates (not used here).
//   p_R, p_G, p_B     - encoded input channel values.
// Returns the converted RGB triplet.
// NOTE(review): this file declares IS_IDT: 1 but outputs AP1 rather than AP0 --
// confirm which primaries the host expects an IDT to return.
__DEVICE__ float3 transform(int p_Width, int p_Height, int p_X, int p_Y, float p_R, float p_G, float p_B)
{
    const float3 encoded = make_float3(p_R, p_G, p_B);
    const float3 lin_rgb = dlog2_to_lin3(encoded);

    // Output goes through the AP1 matrix.
    return mult_f3_f33(lin_rgb, dgamut2_to_AP1);
}