#include <ccv.h>
#include <ccv_internal.h>
#include <nnc/ccv_nnc.h>
#include <nnc/ccv_nnc_easy.h>
#include <nnc/ccv_nnc_internal.h>
#ifdef USE_OPENMP
#include <omp.h>
#endif
#ifdef USE_DISPATCH
#include <dispatch/dispatch.h>
#endif
#include "../_ccv_nnc_cpu_ref.h"
int _ccv_nnc_ewsum_forw_cpu_ref(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const ccv_nnc_stream_context_t* const stream_context)
{
if (input_size == 1 && output_size == 1)
{
_ccv_nnc_tensor_transfer_cpu_ref((const ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)outputs[0]);
return CCV_NNC_EXEC_SUCCESS;
}
// Assuming this is float 32.
int dim[CCV_NNC_MAX_DIM + 2];
int ainc[CCV_NNC_MAX_DIM + 2];
int binc[CCV_NNC_MAX_DIM + 2];
int cinc[CCV_NNC_MAX_DIM + 2];
int x, z;
int k = 0;
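// With CCV_NNC_MAX_DIM == 2 there are at most 4 dimensions (e.g. N, H, W, C for
// NHWC). inc[d] is the extent of dimension d in the underlying tensor a view was
// carved from, so the stride of dimension d is the product of inc[d + 1 .. 3];
// the (inc[d] - dim[d]) terms below skip over the elements a view does not cover.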
// Bad, I promised this can be an in-place operation. Need to first find out whether any input shares the same pointer as the output.
for (z = 1; z < input_size; z++)
{
ccv_nnc_tensor_view_t* c = (ccv_nnc_tensor_view_t*)outputs[0];
ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[z];
if (c->data.f32 == a->data.f32)
{
k = z;
break;
}
}
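// At this point k indexes the input that aliases the output (0 if none), so the
// pairwise pass below consumes it first. For example, with 3 inputs and k == 1:
// pass 0 computes c = inputs[1] + inputs[0], pass 1 computes c = c + inputs[2];
// the aliased buffer is only overwritten element by element as it is read.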
for (z = 0; z < input_size - 1; z++)
{
ccv_nnc_tensor_view_t* c = (ccv_nnc_tensor_view_t*)outputs[0];
ccv_nnc_tensor_view_t* a = z > 0 ? c : (ccv_nnc_tensor_view_t*)inputs[k];
ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)(z >= k ? inputs[z + 1] : inputs[z]);
assert(a->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
assert(c->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
ccv_nnc_tensor_view_get_dim(a, dim);
ccv_nnc_tensor_view_check_dim(b, dim);
ccv_nnc_tensor_view_check_dim(c, dim);
if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c))
{
// Super optimal case, just do one for-loop for the sum.
const int tensor_count = ccv_nnc_tensor_count(a->info);
for (x = 0; x < tensor_count; x++)
c->data.f32[x] = a->data.f32[x] + b->data.f32[x];
continue;
}
assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for other values of CCV_NNC_MAX_DIM.
ccv_nnc_tensor_view_get_inc(a, ainc);
ccv_nnc_tensor_view_get_inc(b, binc);
ccv_nnc_tensor_view_get_inc(c, cinc);
int i[CCV_NNC_MAX_DIM + 2];
float* ap = a->data.f32;
float* bp = b->data.f32;
float* cp = c->data.f32;
const int count = dim[2] * dim[3];
if (ainc[3] == dim[3] && binc[3] == dim[3] && cinc[3] == dim[3])
{
// Special casing if ainc[3] is the same as dim[3] (collapse the last two dims into one inner loop).
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
for (i[1] = 0; i[1] < dim[1]; i[1]++)
{
for (x = 0; x < count; x++)
cp[x] = ap[x] + bp[x];
ap += ainc[2] * ainc[3];
bp += binc[2] * binc[3];
cp += cinc[2] * cinc[3];
}
ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
bp += (binc[1] - dim[1]) * binc[2] * binc[3];
cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
}
continue;
}
// Non-optimal case, need to do skip copy.
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
for (i[1] = 0; i[1] < dim[1]; i[1]++)
{
for (i[2] = 0; i[2] < dim[2]; i[2]++)
{
for (x = 0; x < dim[3]; x++)
cp[x] = ap[x] + bp[x];
ap += ainc[3];
bp += binc[3];
cp += cinc[3];
}
ap += (ainc[2] - dim[2]) * ainc[3];
bp += (binc[2] - dim[2]) * binc[3];
cp += (cinc[2] - dim[2]) * cinc[3];
}
ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
bp += (binc[1] - dim[1]) * binc[2] * binc[3];
cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
}
}
return CCV_NNC_EXEC_SUCCESS;
}
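// Usage sketch (a hypothetical example, assuming the convenience macros from
// nnc/ccv_nnc_easy.h such as CMD_EWSUM_FORWARD, CPU_TENSOR_NHWC and TENSOR_LIST):
//
//   ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
//   ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
//   ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
//   ccv_nnc_cmd_exec(CMD_EWSUM_FORWARD(), ccv_nnc_no_hint, 0,
//     TENSOR_LIST(a, b), TENSOR_LIST(c), 0);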
static int _ccv_nnc_ewsum_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const ccv_nnc_stream_context_t* const stream_context)
{
// D[x + y + z, x] = 1
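// The gradient w.r.t. every summand is g (inputs[0]) itself; a null g stands for
// an all-ones gradient.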
int i;
if (inputs[0] == 0)
{
// Set them to 1.
for (i = 0; i < output_size; i++)
if (outputs[i])
_ccv_nnc_tensor_set_cpu_ref((ccv_nnc_tensor_view_t*)outputs[i], 1);
} else {
// Copy over the gradient (if they are not already pointing to the same tensor).
for (i = 0; i < output_size; i++)
if (inputs[0] != outputs[i] && outputs[i])
_ccv_nnc_tensor_transfer_cpu_ref((ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)outputs[i]);
}
return CCV_NNC_EXEC_SUCCESS;
}
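// Element-wise product: outputs[0] = inputs[0] * inputs[1] * ... using the same
// pairwise, alias-aware structure as _ccv_nnc_ewsum_forw_cpu_ref above.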
static int _ccv_nnc_ewprod_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const ccv_nnc_stream_context_t* const stream_context)
{
if (input_size == 1 && output_size == 1)
{
_ccv_nnc_tensor_transfer_cpu_ref((const ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)outputs[0]);
return CCV_NNC_EXEC_SUCCESS;
}
// Assuming this is float 32.
int dim[CCV_NNC_MAX_DIM + 2];
int ainc[CCV_NNC_MAX_DIM + 2];
int binc[CCV_NNC_MAX_DIM + 2];
int cinc[CCV_NNC_MAX_DIM + 2];
int x, z;
int k = 0;
// Bad, I promised this can be an in-place operation. Need to first find out whether any input shares the same pointer as the output.
for (z = 1; z < input_size; z++)
{
ccv_nnc_tensor_view_t* c = (ccv_nnc_tensor_view_t*)outputs[0];
ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[z];
if (c->data.f32 == a->data.f32)
{
k = z;
break;
}
}
for (z = 0; z < input_size - 1; z++)
{
ccv_nnc_tensor_view_t* c = (ccv_nnc_tensor_view_t*)outputs[0];
ccv_nnc_tensor_view_t* a = z > 0 ? c : (ccv_nnc_tensor_view_t*)inputs[k];
ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)(z >= k ? inputs[z + 1] : inputs[z]);
assert(a->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
assert(c->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
ccv_nnc_tensor_view_get_dim(a, dim);
ccv_nnc_tensor_view_check_dim(b, dim);
ccv_nnc_tensor_view_check_dim(c, dim);
if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c))
{
// Super optimal case, just do one for-loop for the product.
const int tensor_count = ccv_nnc_tensor_count(a->info);
for (x = 0; x < tensor_count; x++)
c->data.f32[x] = a->data.f32[x] * b->data.f32[x];
continue;
}
assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for other values of CCV_NNC_MAX_DIM.
ccv_nnc_tensor_view_get_inc(a, ainc);
ccv_nnc_tensor_view_get_inc(b, binc);
ccv_nnc_tensor_view_get_inc(c, cinc);
int i[CCV_NNC_MAX_DIM + 2];
float* ap = a->data.f32;
float* bp = b->data.f32;
float* cp = c->data.f32;
const int count = dim[2] * dim[3];
if (ainc[3] == dim[3] && binc[3] == dim[3] && cinc[3] == dim[3])
{
// Special casing if ainc[3] is the same as dim[3]
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
for (i[1] = 0; i[1] < dim[1]; i[1]++)
{
for (x = 0; x < count; x++)
cp[x] = ap[x] * bp[x];
ap += ainc[2] * ainc[3];
bp += binc[2] * binc[3];
cp += cinc[2] * cinc[3];
}
ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
bp += (binc[1] - dim[1]) * binc[2] * binc[3];
cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
}
continue;
}
// Non-optimal case, need to do skip copy.
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
for (i[1] = 0; i[1] < dim[1]; i[1]++)
{
for (i[2] = 0; i[2] < dim[2]; i[2]++)
{
for (x = 0; x < dim[3]; x++)
cp[x] = ap[x] * bp[x];
ap += ainc[3];
bp += binc[3];
cp += cinc[3];
}
ap += (ainc[2] - dim[2]) * ainc[3];
bp += (binc[2] - dim[2]) * binc[3];
cp += (cinc[2] - dim[2]) * cinc[3];
}
ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
bp += (binc[1] - dim[1]) * binc[2] * binc[3];
cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
}
}
return CCV_NNC_EXEC_SUCCESS;
}
static int _ccv_nnc_ewprod_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const ccv_nnc_stream_context_t* const stream_context)
{
// D[x * y * z, x] = y * z
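// For p = x_0 * x_1 * ..., the gradient w.r.t. x_i is g * p / x_i (the product of
// all the other factors). inputs[0] is g (0 means all ones), inputs[1 .. output_size]
// are the x_i, and inputs[output_size + 1] is the saved forward output p. Note the
// division assumes no element of x_i is 0.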
// Assuming this is float 32.
int dim[CCV_NNC_MAX_DIM + 2];
int ginc[CCV_NNC_MAX_DIM + 2];
int ainc[CCV_NNC_MAX_DIM + 2];
int binc[CCV_NNC_MAX_DIM + 2];
int hinc[CCV_NNC_MAX_DIM + 2];
int x, z;
ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0];
ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)inputs[output_size + 1];
if (g == 0)
{
assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
ccv_nnc_tensor_view_get_dim(b, dim);
ccv_nnc_tensor_view_get_inc(b, binc);
for (z = 0; z < output_size; z++)
{
ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[z + 1];
ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[z];
assert(a->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
assert(h->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
ccv_nnc_tensor_view_check_dim(a, dim);
ccv_nnc_tensor_view_check_dim(h, dim);
ccv_nnc_tensor_view_get_inc(a, ainc);
ccv_nnc_tensor_view_get_inc(h, hinc);
if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(h))
{
// Super optimal case, just do one for-loop for the division.
const int tensor_count = ccv_nnc_tensor_count(b->info);
for (x = 0; x < tensor_count; x++)
h->data.f32[x] = b->data.f32[x] / a->data.f32[x];
continue;
}
assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for other values of CCV_NNC_MAX_DIM.
int i[CCV_NNC_MAX_DIM + 2];
float* ap = a->data.f32;
float* bp = b->data.f32;
float* hp = h->data.f32;
const int count = dim[2] * dim[3];
if (ainc[3] == dim[3] && binc[3] == dim[3] && hinc[3] == dim[3])
{
// Special casing if ainc[3] is the same as dim[3]
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
for (i[1] = 0; i[1] < dim[1]; i[1]++)
{
for (x = 0; x < count; x++)
hp[x] = bp[x] / ap[x];
ap += ainc[2] * ainc[3];
bp += binc[2] * binc[3];
hp += hinc[2] * hinc[3];
}
ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
bp += (binc[1] - dim[1]) * binc[2] * binc[3];
hp += (hinc[1] - dim[1]) * hinc[2] * hinc[3];
}
continue;
}
// Non-optimal case, need to do skip copy.
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
for (i[1] = 0; i[1] < dim[1]; i[1]++)
{
for (i[2] = 0; i[2] < dim[2]; i[2]++)
{
for (x = 0; x < dim[3]; x++)
hp[x] = bp[x] / ap[x];
ap += ainc[3];
bp += binc[3];
hp += hinc[3];
}
ap += (ainc[2] - dim[2]) * ainc[3];
bp += (binc[2] - dim[2]) * binc[3];
hp += (hinc[2] - dim[2]) * hinc[3];
}
ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
bp += (binc[1] - dim[1]) * binc[2] * binc[3];
hp += (hinc[1] - dim[1]) * hinc[2] * hinc[3];
}
}
} else {
assert(g->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
ccv_nnc_tensor_view_get_dim(b, dim);
ccv_nnc_tensor_view_check_dim(g, dim);
ccv_nnc_tensor_view_get_inc(b, binc);
ccv_nnc_tensor_view_get_inc(g, ginc);
for (z = 0; z < output_size; z++)
{
ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[z + 1];
ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[z];
assert(a->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
assert(h->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
ccv_nnc_tensor_view_check_dim(a, dim);
ccv_nnc_tensor_view_check_dim(h, dim);
ccv_nnc_tensor_view_get_inc(a, ainc);
ccv_nnc_tensor_view_get_inc(h, hinc);
if (!CCV_IS_TENSOR_VIEW(g) && !CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(h))
{
// Super optimal case, just do one for-loop for the gradient g * b / a.
const int tensor_count = ccv_nnc_tensor_count(g->info);
for (x = 0; x < tensor_count; x++)
h->data.f32[x] = g->data.f32[x] * b->data.f32[x] / a->data.f32[x];
continue;
}
assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for other values of CCV_NNC_MAX_DIM.
int i[CCV_NNC_MAX_DIM + 2];
float* gp = g->data.f32;
float* ap = a->data.f32;
float* bp = b->data.f32;
float* hp = h->data.f32;
const int count = dim[2] * dim[3];
if (ginc[3] == dim[3] && ainc[3] == dim[3] && binc[3] == dim[3] && hinc[3] == dim[3])
{
// Special casing if ginc[3] is the same as dim[3]
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
for (i[1] = 0; i[1] < dim[1]; i[1]++)
{
for (x = 0; x < count; x++)
hp[x] = gp[x] * bp[x] / ap[x];
gp += ginc[2] * ginc[3];
ap += ainc[2] * ainc[3];
bp += binc[2] * binc[3];
hp += hinc[2] * hinc[3];
}
gp += (ginc[1] - dim[1]) * ginc[2] * ginc[3];
ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
bp += (binc[1] - dim[1]) * binc[2] * binc[3];
hp += (hinc[1] - dim[1]) * hinc[2] * hinc[3];
}
continue;
}
// Non-optimal case, need to do skip copy.
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
for (i[1] = 0; i[1] < dim[1]; i[1]++)
{
for (i[2] = 0; i[2] < dim[2]; i[2]++)
{
for (x = 0; x < dim[3]; x++)
hp[x] = gp[x] * bp[x] / ap[x];
gp += ginc[3];
ap += ainc[3];
bp += binc[3];
hp += hinc[3];
}
gp += (ginc[2] - dim[2]) * ginc[3];
ap += (ainc[2] - dim[2]) * ainc[3];
bp += (binc[2] - dim[2]) * binc[3];
hp += (hinc[2] - dim[2]) * hinc[3];
}
gp += (ginc[1] - dim[1]) * ginc[2] * ginc[3];
ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
bp += (binc[1] - dim[1]) * binc[2] * binc[3];
hp += (hinc[1] - dim[1]) * hinc[2] * hinc[3];
}
}
}
return CCV_NNC_EXEC_SUCCESS;
}
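// Element-wise division c = a / b; a null a is treated as an all-ones tensor,
// turning this into an element-wise reciprocal (this kernel is also reused by
// the log backward pass below).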
static int _ccv_nnc_ewdiv_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const ccv_nnc_stream_context_t* const stream_context)
{
// Assuming this is float 32.
int dim[CCV_NNC_MAX_DIM + 2];
int ainc[CCV_NNC_MAX_DIM + 2];
int binc[CCV_NNC_MAX_DIM + 2];
int cinc[CCV_NNC_MAX_DIM + 2];
ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)inputs[1];
ccv_nnc_tensor_view_t* c = (ccv_nnc_tensor_view_t*)outputs[0];
if (a == 0) // Treat a null input as an all-ones tensor.
{
assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
assert(c->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
ccv_nnc_tensor_view_get_dim(b, dim);
ccv_nnc_tensor_view_check_dim(c, dim);
int x;
if (!CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c))
{
// Super optimal case, just do one for-loop for the reciprocal.
const int tensor_count = ccv_nnc_tensor_count(b->info);
for (x = 0; x < tensor_count; x++)
c->data.f32[x] = 1 / b->data.f32[x];
return CCV_NNC_EXEC_SUCCESS;
}
assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for other values of CCV_NNC_MAX_DIM.
ccv_nnc_tensor_view_get_inc(b, binc);
ccv_nnc_tensor_view_get_inc(c, cinc);
int i[CCV_NNC_MAX_DIM + 2];
float* bp = b->data.f32;
float* cp = c->data.f32;
const int count = dim[2] * dim[3];
if (binc[3] == dim[3] && cinc[3] == dim[3])
{
// Special casing if binc[3] is the same as dim[3]
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
for (i[1] = 0; i[1] < dim[1]; i[1]++)
{
for (x = 0; x < count; x++)
cp[x] = 1 / bp[x];
bp += binc[2] * binc[3];
cp += cinc[2] * cinc[3];
}
bp += (binc[1] - dim[1]) * binc[2] * binc[3];
cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
}
return CCV_NNC_EXEC_SUCCESS;
}
// Non-optimal case, need to do skip copy.
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
for (i[1] = 0; i[1] < dim[1]; i[1]++)
{
for (i[2] = 0; i[2] < dim[2]; i[2]++)
{
for (x = 0; x < dim[3]; x++)
cp[x] = 1 / bp[x];
bp += binc[3];
cp += cinc[3];
}
bp += (binc[2] - dim[2]) * binc[3];
cp += (cinc[2] - dim[2]) * cinc[3];
}
bp += (binc[1] - dim[1]) * binc[2] * binc[3];
cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
}
} else {
assert(a->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
assert(c->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
ccv_nnc_tensor_view_get_dim(a, dim);
ccv_nnc_tensor_view_check_dim(b, dim);
ccv_nnc_tensor_view_check_dim(c, dim);
int x;
if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c))
{
// Super optimal case, just do one for-loop for the division.
const int tensor_count = ccv_nnc_tensor_count(a->info);
for (x = 0; x < tensor_count; x++)
c->data.f32[x] = a->data.f32[x] / b->data.f32[x];
return CCV_NNC_EXEC_SUCCESS;
}
assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for other values of CCV_NNC_MAX_DIM.
ccv_nnc_tensor_view_get_inc(a, ainc);
ccv_nnc_tensor_view_get_inc(b, binc);
ccv_nnc_tensor_view_get_inc(c, cinc);
int i[CCV_NNC_MAX_DIM + 2];
float* ap = a->data.f32;
float* bp = b->data.f32;
float* cp = c->data.f32;
const int count = dim[2] * dim[3];
if (ainc[3] == dim[3] && binc[3] == dim[3] && cinc[3] == dim[3])
{
// Special casing if the ainc[3] is the same as dim[3]
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
for (i[1] = 0; i[1] < dim[1]; i[1]++)
{
for (x = 0; x < count; x++)
cp[x] = ap[x] / bp[x];
ap += ainc[2] * ainc[3];
bp += binc[2] * binc[3];
cp += cinc[2] * cinc[3];
}
ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
bp += (binc[1] - dim[1]) * binc[2] * binc[3];
cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
}
return CCV_NNC_EXEC_SUCCESS;
}
// Non-optimal case, need to do skip copy.
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
for (i[1] = 0; i[1] < dim[1]; i[1]++)
{
for (i[2] = 0; i[2] < dim[2]; i[2]++)
{
for (x = 0; x < dim[3]; x++)
cp[x] = ap[x] / bp[x];
ap += ainc[3];
bp += binc[3];
cp += cinc[3];
}
ap += (ainc[2] - dim[2]) * ainc[3];
bp += (binc[2] - dim[2]) * binc[3];
cp += (cinc[2] - dim[2]) * cinc[3];
}
ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
bp += (binc[1] - dim[1]) * binc[2] * binc[3];
cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
}
}
return CCV_NNC_EXEC_SUCCESS;
}
static int _ccv_nnc_ewdiv_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const ccv_nnc_stream_context_t* const stream_context)
{
// D[x / y, x] = 1 / y, D[x / y, y] = -x / y^2
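// With the forward output c = a / b saved in inputs[3], the loops below reuse
// v = g / b: ha = g * D[a / b, a] = g / b = v, and
// hb = g * D[a / b, b] = -g * a / b^2 = -(a / b) * (g / b) = -c * v.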
if (output_size == 1 || outputs[1] == 0)
{
// When we only need D[x / y, x]
ccv_nnc_cmd_t forw_cmd = cmd;
forw_cmd.cmd = CCV_NNC_EWDIV_FORWARD;
return _ccv_nnc_ewdiv_forw(forw_cmd, ccv_nnc_no_hint, flags, TENSOR_LIST(inputs[0], inputs[2]), outputs, 1, stream_context);
}
int dim[CCV_NNC_MAX_DIM + 2];
int ginc[CCV_NNC_MAX_DIM + 2];
int binc[CCV_NNC_MAX_DIM + 2];
int cinc[CCV_NNC_MAX_DIM + 2];
int hainc[CCV_NNC_MAX_DIM + 2];
int hbinc[CCV_NNC_MAX_DIM + 2];
ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0];
ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)inputs[2];
ccv_nnc_tensor_view_t* c = (ccv_nnc_tensor_view_t*)inputs[3];
ccv_nnc_tensor_view_t* ha = (ccv_nnc_tensor_view_t*)outputs[0];
ccv_nnc_tensor_view_t* hb = (ccv_nnc_tensor_view_t*)outputs[1];
if (g == 0)
{
assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
assert(c->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
assert(ha->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
assert(hb->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
ccv_nnc_tensor_view_get_dim(b, dim);
ccv_nnc_tensor_view_check_dim(c, dim);
ccv_nnc_tensor_view_check_dim(ha, dim);
ccv_nnc_tensor_view_check_dim(hb, dim);
int x;
if (!CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && !CCV_IS_TENSOR_VIEW(ha) && !CCV_IS_TENSOR_VIEW(hb))
{
// Super optimal case, just do one for-loop for both gradients.
const int tensor_count = ccv_nnc_tensor_count(b->info);
for (x = 0; x < tensor_count; x++)
{
const float v = 1 / b->data.f32[x];
ha->data.f32[x] = v;
hb->data.f32[x] = -c->data.f32[x] * v;
}
return CCV_NNC_EXEC_SUCCESS;
}
assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for other values of CCV_NNC_MAX_DIM.
ccv_nnc_tensor_view_get_inc(b, binc);
ccv_nnc_tensor_view_get_inc(c, cinc);
ccv_nnc_tensor_view_get_inc(ha, hainc);
ccv_nnc_tensor_view_get_inc(hb, hbinc);
int i[CCV_NNC_MAX_DIM + 2];
float* bp = b->data.f32;
float* cp = c->data.f32;
float* hap = ha->data.f32;
float* hbp = hb->data.f32;
const int count = dim[2] * dim[3];
if (binc[3] == dim[3] && cinc[3] == dim[3] && hainc[3] == dim[3] && hbinc[3] == dim[3])
{
// Special casing if binc[3] is the same as dim[3]
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
for (i[1] = 0; i[1] < dim[1]; i[1]++)
{
for (x = 0; x < count; x++)
{
const float v = 1 / bp[x];
hap[x] = v;
hbp[x] = -cp[x] * v;
}
bp += binc[2] * binc[3];
cp += cinc[2] * cinc[3];
hap += hainc[2] * hainc[3];
hbp += hbinc[2] * hbinc[3];
}
bp += (binc[1] - dim[1]) * binc[2] * binc[3];
cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
hap += (hainc[1] - dim[1]) * hainc[2] * hainc[3];
hbp += (hbinc[1] - dim[1]) * hbinc[2] * hbinc[3];
}
return CCV_NNC_EXEC_SUCCESS;
}
// Non-optimal case, need to do skip copy.
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
for (i[1] = 0; i[1] < dim[1]; i[1]++)
{
for (i[2] = 0; i[2] < dim[2]; i[2]++)
{
for (x = 0; x < dim[3]; x++)
{
const float v = 1 / bp[x];
hap[x] = v;
hbp[x] = -cp[x] * v;
}
bp += binc[3];
cp += cinc[3];
hap += hainc[3];
hbp += hbinc[3];
}
bp += (binc[2] - dim[2]) * binc[3];
cp += (cinc[2] - dim[2]) * cinc[3];
hap += (hainc[2] - dim[2]) * hainc[3];
hbp += (hbinc[2] - dim[2]) * hbinc[3];
}
bp += (binc[1] - dim[1]) * binc[2] * binc[3];
cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
hap += (hainc[1] - dim[1]) * hainc[2] * hainc[3];
hbp += (hbinc[1] - dim[1]) * hbinc[2] * hbinc[3];
}
} else {
assert(g->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
assert(c->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
assert(ha->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
assert(hb->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
ccv_nnc_tensor_view_get_dim(b, dim);
ccv_nnc_tensor_view_check_dim(g, dim);
ccv_nnc_tensor_view_check_dim(c, dim);
ccv_nnc_tensor_view_check_dim(ha, dim);
ccv_nnc_tensor_view_check_dim(hb, dim);
int x;
if (!CCV_IS_TENSOR_VIEW(g) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && !CCV_IS_TENSOR_VIEW(ha) && !CCV_IS_TENSOR_VIEW(hb))
{
// Super optimal case, just do one for-loop for both gradients.
const int tensor_count = ccv_nnc_tensor_count(g->info);
for (x = 0; x < tensor_count; x++)
{
const float v = g->data.f32[x] / b->data.f32[x];
ha->data.f32[x] = v;
hb->data.f32[x] = -c->data.f32[x] * v;
}
return CCV_NNC_EXEC_SUCCESS;
}
assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for other values of CCV_NNC_MAX_DIM.
ccv_nnc_tensor_view_get_inc(g, ginc);
ccv_nnc_tensor_view_get_inc(b, binc);
ccv_nnc_tensor_view_get_inc(c, cinc);
ccv_nnc_tensor_view_get_inc(ha, hainc);
ccv_nnc_tensor_view_get_inc(hb, hbinc);
int i[CCV_NNC_MAX_DIM + 2];
float* gp = g->data.f32;
float* bp = b->data.f32;
float* cp = c->data.f32;
float* hap = ha->data.f32;
float* hbp = hb->data.f32;
const int count = dim[2] * dim[3];
if (ginc[3] == dim[3] && binc[3] == dim[3] && cinc[3] == dim[3] && hainc[3] == dim[3] && hbinc[3] == dim[3])
{
// Special casing if ginc[3] is the same as dim[3]
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
for (i[1] = 0; i[1] < dim[1]; i[1]++)
{
for (x = 0; x < count; x++)
{
const float v = gp[x] / bp[x];
hap[x] = v;
hbp[x] = -cp[x] * v;
}
gp += ginc[2] * ginc[3];
bp += binc[2] * binc[3];
cp += cinc[2] * cinc[3];
hap += hainc[2] * hainc[3];
hbp += hbinc[2] * hbinc[3];
}
gp += (ginc[1] - dim[1]) * ginc[2] * ginc[3];
bp += (binc[1] - dim[1]) * binc[2] * binc[3];
cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
hap += (hainc[1] - dim[1]) * hainc[2] * hainc[3];
hbp += (hbinc[1] - dim[1]) * hbinc[2] * hbinc[3];
}
return CCV_NNC_EXEC_SUCCESS;
}
// Non-optimal case, need to do skip copy.
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
for (i[1] = 0; i[1] < dim[1]; i[1]++)
{
for (i[2] = 0; i[2] < dim[2]; i[2]++)
{
for (x = 0; x < dim[3]; x++)
{
const float v = gp[x] / bp[x];
hap[x] = v;
hbp[x] = -cp[x] * v;
}
gp += ginc[3];
bp += binc[3];
cp += cinc[3];
hap += hainc[3];
hbp += hbinc[3];
}
gp += (ginc[2] - dim[2]) * ginc[3];
bp += (binc[2] - dim[2]) * binc[3];
cp += (cinc[2] - dim[2]) * cinc[3];
hap += (hainc[2] - dim[2]) * hainc[3];
hbp += (hbinc[2] - dim[2]) * hbinc[3];
}
gp += (ginc[1] - dim[1]) * ginc[2] * ginc[3];
bp += (binc[1] - dim[1]) * binc[2] * binc[3];
cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
hap += (hainc[1] - dim[1]) * hainc[2] * hainc[3];
hbp += (hbinc[1] - dim[1]) * hbinc[2] * hbinc[3];
}
}
return CCV_NNC_EXEC_SUCCESS;
}
static int _ccv_nnc_ewexp_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const ccv_nnc_stream_context_t* const stream_context)
{
// Assuming this is float 32.
int dim[CCV_NNC_MAX_DIM + 2];
int ainc[CCV_NNC_MAX_DIM + 2];
int binc[CCV_NNC_MAX_DIM + 2];
ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
assert(a->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
ccv_nnc_tensor_view_get_dim(a, dim);
ccv_nnc_tensor_view_check_dim(b, dim);
int x;
if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
{
// Super optimal case, just do one for-loop for the exponential.
const int tensor_count = ccv_nnc_tensor_count(a->info);
for (x = 0; x < tensor_count; x++)
b->data.f32[x] = exp(a->data.f32[x]);
return CCV_NNC_EXEC_SUCCESS;
}
assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for other values of CCV_NNC_MAX_DIM.
ccv_nnc_tensor_view_get_inc(a, ainc);
ccv_nnc_tensor_view_get_inc(b, binc);
int i[CCV_NNC_MAX_DIM + 2];
float* ap = a->data.f32;
float* bp = b->data.f32;
const int count = dim[2] * dim[3];
if (ainc[3] == dim[3] && binc[3] == dim[3])
{
// Special casing if ainc[3] is the same as dim[3]
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
for (i[1] = 0; i[1] < dim[1]; i[1]++)
{
for (x = 0; x < count; x++)
bp[x] = exp(ap[x]);
ap += ainc[2] * ainc[3];
bp += binc[2] * binc[3];
}
ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
bp += (binc[1] - dim[1]) * binc[2] * binc[3];
}
return CCV_NNC_EXEC_SUCCESS;
}
// Non-optimal case, need to do skip copy.
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
for (i[1] = 0; i[1] < dim[1]; i[1]++)
{
for (i[2] = 0; i[2] < dim[2]; i[2]++)
{
for (x = 0; x < dim[3]; x++)
bp[x] = exp(ap[x]);
ap += ainc[3];
bp += binc[3];
}
ap += (ainc[2] - dim[2]) * ainc[3];
bp += (binc[2] - dim[2]) * binc[3];
}
ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
bp += (binc[1] - dim[1]) * binc[2] * binc[3];
}
return CCV_NNC_EXEC_SUCCESS;
}
static int _ccv_nnc_ewexp_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const ccv_nnc_stream_context_t* const stream_context)
{
// D[Exp[x], x] = Exp[x]
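// inputs[2] holds the saved forward output Exp[x], so the gradient g * Exp[x] is
// a plain copy when g == 0 (all ones) and an element-wise product otherwise.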
if (inputs[0] == 0)
{
_ccv_nnc_tensor_transfer_cpu_ref((ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]);
return CCV_NNC_EXEC_SUCCESS;
} else {
ccv_nnc_cmd_t forw_cmd = cmd;
forw_cmd.cmd = CCV_NNC_EWPROD_FORWARD;
return _ccv_nnc_ewprod_forw(forw_cmd, ccv_nnc_no_hint, flags, TENSOR_LIST(inputs[0], inputs[2]), outputs, output_size, stream_context);
}
}
static int _ccv_nnc_ewlog_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const ccv_nnc_stream_context_t* const stream_context)
{
// Assuming this is float 32.
int dim[CCV_NNC_MAX_DIM + 2];
int ainc[CCV_NNC_MAX_DIM + 2];
int binc[CCV_NNC_MAX_DIM + 2];
ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
assert(a->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
ccv_nnc_tensor_view_get_dim(a, dim);
ccv_nnc_tensor_view_check_dim(b, dim);
int x;
if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
{
// Super optimal case, just do one for-loop for the logarithm.
const int tensor_count = ccv_nnc_tensor_count(a->info);
for (x = 0; x < tensor_count; x++)
b->data.f32[x] = log(a->data.f32[x]);
return CCV_NNC_EXEC_SUCCESS;
}
assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for other values of CCV_NNC_MAX_DIM.
ccv_nnc_tensor_view_get_inc(a, ainc);
ccv_nnc_tensor_view_get_inc(b, binc);
int i[CCV_NNC_MAX_DIM + 2];
float* ap = a->data.f32;
float* bp = b->data.f32;
const int count = dim[2] * dim[3];
if (ainc[3] == dim[3] && binc[3] == dim[3])
{
// Special casing if ainc[3] is the same as dim[3]
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
for (i[1] = 0; i[1] < dim[1]; i[1]++)
{
for (x = 0; x < count; x++)
bp[x] = log(ap[x]);
ap += ainc[2] * ainc[3];
bp += binc[2] * binc[3];
}
ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
bp += (binc[1] - dim[1]) * binc[2] * binc[3];
}
return CCV_NNC_EXEC_SUCCESS;
}
// Non-optimal case, need to do skip copy.
for (i[0] = 0; i[0] < dim[0]; i[0]++)
{
for (i[1] = 0; i[1] < dim[1]; i[1]++)
{
for (i[2] = 0; i[2] < dim[2]; i[2]++)
{
for (x = 0; x < dim[3]; x++)
bp[x] = log(ap[x]);
ap += ainc[3];
bp += binc[3];
}
ap += (ainc[2] - dim[2]) * ainc[3];
bp += (binc[2] - dim[2]) * binc[3];
}
ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
bp += (binc[1] - dim[1]) * binc[2] * binc[3];
}
return CCV_NNC_EXEC_SUCCESS;
}
static int _ccv_nnc_ewlog_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const ccv_nnc_stream_context_t* const stream_context)
{
ccv_nnc_cmd_t forw_cmd = cmd;
forw_cmd.cmd = CCV_NNC_EWDIV_FORWARD;
// D[Log[x], x] = 1 / x
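// so the gradient is g / x, i.e. EWDIV forward on (inputs[0], inputs[1]); when g
// is 0, _ccv_nnc_ewdiv_forw treats it as all ones and computes 1 / x directly.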
return _ccv_nnc_ewdiv_forw(forw_cmd, ccv_nnc_no_hint, flags, TENSOR_LIST(inputs[0], inputs[1]), outputs, output_size, stream_context);
}
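// Each block below registers one of the reference kernels above as the CPU
// backend for its command: CCV_32F data only, NHWC/NCHW/CHWN formats, CPU
// memory, and a single algorithm.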
REGISTER_COMMAND_BACKEND(CCV_NNC_EWSUM_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
registry->tensor_datatypes = CCV_32F;
registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
registry->algorithms = 1;
registry->exec = _ccv_nnc_ewsum_forw_cpu_ref;
}
REGISTER_COMMAND_BACKEND(CCV_NNC_EWSUM_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
registry->tensor_datatypes = CCV_32F;
registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
registry->algorithms = 1;
registry->exec = _ccv_nnc_ewsum_back;
}
REGISTER_COMMAND_BACKEND(CCV_NNC_EWPROD_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
registry->tensor_datatypes = CCV_32F;
registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
registry->algorithms = 1;
registry->exec = _ccv_nnc_ewprod_forw;
}
REGISTER_COMMAND_BACKEND(CCV_NNC_EWPROD_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
registry->tensor_datatypes = CCV_32F;
registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
registry->algorithms = 1;
registry->exec = _ccv_nnc_ewprod_back;
}
REGISTER_COMMAND_BACKEND(CCV_NNC_EWDIV_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
registry->tensor_datatypes = CCV_32F;
registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
registry->algorithms = 1;
registry->exec = _ccv_nnc_ewdiv_forw;
}
REGISTER_COMMAND_BACKEND(CCV_NNC_EWDIV_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
registry->tensor_datatypes = CCV_32F;
registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
registry->algorithms = 1;
registry->exec = _ccv_nnc_ewdiv_back;
}
REGISTER_COMMAND_BACKEND(CCV_NNC_EWEXP_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
registry->tensor_datatypes = CCV_32F;
registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
registry->algorithms = 1;
registry->exec = _ccv_nnc_ewexp_forw;
}
REGISTER_COMMAND_BACKEND(CCV_NNC_EWEXP_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
registry->tensor_datatypes = CCV_32F;
registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
registry->algorithms = 1;
registry->exec = _ccv_nnc_ewexp_back;
}
REGISTER_COMMAND_BACKEND(CCV_NNC_EWLOG_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
registry->tensor_datatypes = CCV_32F;
registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
registry->algorithms = 1;
registry->exec = _ccv_nnc_ewlog_forw;
}
REGISTER_COMMAND_BACKEND(CCV_NNC_EWLOG_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
registry->tensor_datatypes = CCV_32F;
registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
registry->algorithms = 1;
registry->exec = _ccv_nnc_ewlog_back;
}