From 9789be109edea8ba463bf0afe55db8332c6f88fa Mon Sep 17 00:00:00 2001 From: Gregory Comer Date: Mon, 15 Jun 2026 16:35:53 -0700 Subject: [PATCH 1/2] Update [ghstack-poisoned] --- .../xnnpack/runtime/executor/executor.cpp | 12 +- .../runtime/kernels/layer_norm/layer_norm.cpp | 20 ++ .../runtime/kernels/layer_norm/layer_norm.h | 18 ++ .../kernels/layer_norm/layer_norm_neon.cpp | 225 ++++++++++++++++++ .../kernels/layer_norm/layer_norm_neon.h | 16 ++ .../kernels/layer_norm/layer_norm_scalar.cpp | 46 ++++ .../kernels/layer_norm/layer_norm_scalar.h | 16 ++ .../xnnpack/runtime/operators/layer_norm.cpp | 80 +++++++ .../xnnpack/runtime/operators/layer_norm.h | 22 ++ .../xnnpack/runtime/operators/operator.cpp | 13 +- backends/xnnpack/runtime/operators/operator.h | 19 +- .../xnnpack/runtime/plan/execution_plan.cpp | 5 +- backends/xnnpack/test/runtime/test_e2e.cpp | 132 ++++++++++ .../executorch/build/build_variables.bzl | 4 + 14 files changed, 614 insertions(+), 14 deletions(-) create mode 100644 backends/xnnpack/runtime/kernels/layer_norm/layer_norm.cpp create mode 100644 backends/xnnpack/runtime/kernels/layer_norm/layer_norm.h create mode 100644 backends/xnnpack/runtime/kernels/layer_norm/layer_norm_neon.cpp create mode 100644 backends/xnnpack/runtime/kernels/layer_norm/layer_norm_neon.h create mode 100644 backends/xnnpack/runtime/kernels/layer_norm/layer_norm_scalar.cpp create mode 100644 backends/xnnpack/runtime/kernels/layer_norm/layer_norm_scalar.h create mode 100644 backends/xnnpack/runtime/operators/layer_norm.cpp create mode 100644 backends/xnnpack/runtime/operators/layer_norm.h diff --git a/backends/xnnpack/runtime/executor/executor.cpp b/backends/xnnpack/runtime/executor/executor.cpp index 262aec30c30..9ef2e6a2a6a 100644 --- a/backends/xnnpack/runtime/executor/executor.cpp +++ b/backends/xnnpack/runtime/executor/executor.cpp @@ -64,9 +64,12 @@ runtime::Error Executor::run_step(size_t step_idx, const plan::PlanStep& step) { } auto t0 = std::chrono::steady_clock::now(); - s.op->execute( + err = s.op->execute( {inputs.data(), inputs.size()}, {outputs.data(), outputs.size()}); + if (err != runtime::Error::Ok) { + return; + } auto t1 = std::chrono::steady_clock::now(); auto us = std::chrono::duration_cast(t1 - t0) @@ -231,7 +234,8 @@ runtime::Error Executor::update_planned_memory(Span inputs) { for (auto slot : op_step->input_slots) { input_specs.push_back(memory_plan.value_specs[slot]); } - op_step->op->reshape({input_specs.data(), input_specs.size()}); + ET_CHECK_OK_OR_RETURN_ERROR( + op_step->op->reshape({input_specs.data(), input_specs.size()})); } return runtime::Error::Ok; @@ -296,8 +300,8 @@ runtime::Result Executor::build(graph::Graph& graph) { for (auto slot : op_step->output_slots) outputs.push_back(&values[slot]); - op_step->op->prepare( - {inputs.data(), inputs.size()}, {outputs.data(), outputs.size()}); + ET_CHECK_OK_OR_RETURN_ERROR(op_step->op->prepare( + {inputs.data(), inputs.size()}, {outputs.data(), outputs.size()})); } auto t4 = std::chrono::steady_clock::now(); diff --git a/backends/xnnpack/runtime/kernels/layer_norm/layer_norm.cpp b/backends/xnnpack/runtime/kernels/layer_norm/layer_norm.cpp new file mode 100644 index 00000000000..aca99dde285 --- /dev/null +++ b/backends/xnnpack/runtime/kernels/layer_norm/layer_norm.cpp @@ -0,0 +1,20 @@ +#include +#include +#ifdef __aarch64__ +#include +#endif + +#include + +namespace executorch::backends::xnnpack::kernels { + +LayerNormF32Fn select_layer_norm_f32_kernel() { +#ifdef __aarch64__ + if (cpuinfo_initialize() && cpuinfo_has_arm_neon()) { + return layer_norm_f32_neon; + } +#endif + return layer_norm_f32_scalar; +} + +} // namespace executorch::backends::xnnpack::kernels diff --git a/backends/xnnpack/runtime/kernels/layer_norm/layer_norm.h b/backends/xnnpack/runtime/kernels/layer_norm/layer_norm.h new file mode 100644 index 00000000000..beeebaaffc6 --- /dev/null +++ b/backends/xnnpack/runtime/kernels/layer_norm/layer_norm.h @@ -0,0 +1,18 @@ +#pragma once + +#include + +namespace executorch::backends::xnnpack::kernels { + +using LayerNormF32Fn = void (*)( + const float* input, + float* output, + const float* weight, + const float* bias, + size_t outer_size, + size_t inner_size, + float eps); + +LayerNormF32Fn select_layer_norm_f32_kernel(); + +} // namespace executorch::backends::xnnpack::kernels diff --git a/backends/xnnpack/runtime/kernels/layer_norm/layer_norm_neon.cpp b/backends/xnnpack/runtime/kernels/layer_norm/layer_norm_neon.cpp new file mode 100644 index 00000000000..3251e7bd754 --- /dev/null +++ b/backends/xnnpack/runtime/kernels/layer_norm/layer_norm_neon.cpp @@ -0,0 +1,225 @@ +#ifdef __aarch64__ + +#include + +#include +#include +#include + +namespace executorch::backends::xnnpack::kernels { + +namespace { +float sum_f32_neon(const float* data, size_t len) { + float32x4_t acc0 = vdupq_n_f32(0); + float32x4_t acc1 = vdupq_n_f32(0); + float32x4_t acc2 = vdupq_n_f32(0); + float32x4_t acc3 = vdupq_n_f32(0); + float32x4_t acc4 = vdupq_n_f32(0); + float32x4_t acc5 = vdupq_n_f32(0); + float32x4_t acc6 = vdupq_n_f32(0); + float32x4_t acc7 = vdupq_n_f32(0); + + size_t i = len; + for (; i >= 32; i -= 32) { + float32x4x2_t in01 = vld1q_f32_x2(data); + float32x4x2_t in23 = vld1q_f32_x2(data + 8); + float32x4x2_t in45 = vld1q_f32_x2(data + 16); + float32x4x2_t in67 = vld1q_f32_x2(data + 24); + + acc0 = vaddq_f32(acc0, in01.val[0]); + acc1 = vaddq_f32(acc1, in01.val[1]); + acc2 = vaddq_f32(acc2, in23.val[0]); + acc3 = vaddq_f32(acc3, in23.val[1]); + acc4 = vaddq_f32(acc4, in45.val[0]); + acc5 = vaddq_f32(acc5, in45.val[1]); + acc6 = vaddq_f32(acc6, in67.val[0]); + acc7 = vaddq_f32(acc7, in67.val[1]); + + data += 32; + } + + acc0 = vaddq_f32(acc0, acc1); + acc2 = vaddq_f32(acc2, acc3); + acc4 = vaddq_f32(acc4, acc5); + acc6 = vaddq_f32(acc6, acc7); + + acc0 = vaddq_f32(acc0, acc2); + acc4 = vaddq_f32(acc4, acc6); + + acc0 = vaddq_f32(acc0, acc4); + + for (; i >= 4; i -= 4) { + float32x4_t in = vld1q_f32(data); + acc0 = vaddq_f32(acc0, in); + data += 4; + } + + float acc = vaddvq_f32(acc0); + + for (; i > 0; i--) { + acc += *data; + data++; + } + + return acc; +} + +float var_sum_f32_neon(const float* data, float mean, size_t len) { + float32x4_t vmean = vdupq_n_f32(mean); + + float32x4_t acc0 = vdupq_n_f32(0); + float32x4_t acc1 = vdupq_n_f32(0); + float32x4_t acc2 = vdupq_n_f32(0); + float32x4_t acc3 = vdupq_n_f32(0); + + size_t i = len; + for (; i >= 16; i -= 16) { + float32x4x2_t in01 = vld1q_f32_x2(data); + float32x4x2_t in23 = vld1q_f32_x2(data + 8); + + float32x4_t delta0 = vsubq_f32(in01.val[0], vmean); + float32x4_t delta1 = vsubq_f32(in01.val[1], vmean); + float32x4_t delta2 = vsubq_f32(in23.val[0], vmean); + float32x4_t delta3 = vsubq_f32(in23.val[1], vmean); + + float32x4_t delta_sq0 = vmulq_f32(delta0, delta0); + float32x4_t delta_sq1 = vmulq_f32(delta1, delta1); + float32x4_t delta_sq2 = vmulq_f32(delta2, delta2); + float32x4_t delta_sq3 = vmulq_f32(delta3, delta3); + + acc0 = vaddq_f32(acc0, delta_sq0); + acc1 = vaddq_f32(acc1, delta_sq1); + acc2 = vaddq_f32(acc2, delta_sq2); + acc3 = vaddq_f32(acc3, delta_sq3); + + data += 16; + } + + acc0 = vaddq_f32(acc0, acc1); + acc2 = vaddq_f32(acc2, acc3); + acc0 = vaddq_f32(acc0, acc2); + + for (; i >= 4; i -= 4) { + float32x4_t in = vld1q_f32(data); + float32x4_t delta = vsubq_f32(in, vmean); + float32x4_t delta_sq = vmulq_f32(delta, delta); + acc0 = vaddq_f32(acc0, delta_sq); + data += 4; + } + + float acc = vaddvq_f32(acc0); + + for (; i > 0; i--) { + float in = *data; + float delta = in - mean; + float delta_sq = delta * delta; + acc += delta_sq; + data++; + } + + return acc; +} + +template +void normalize_f32_neon( + const float* input, + float mean, + float inv_std, + const float* weight, + const float* bias, + float* out, + size_t len) { + float32x4_t vmean = vdupq_n_f32(mean); + float32x4_t vinv_std = vdupq_n_f32(inv_std); + + size_t i = len; + for (; i >= 16; i -= 16) { + float32x4x2_t in01 = vld1q_f32_x2(input); + float32x4x2_t in23 = vld1q_f32_x2(input + 8); + + float32x4_t norm0 = vmulq_f32(vsubq_f32(in01.val[0], vmean), vinv_std); + float32x4_t norm1 = vmulq_f32(vsubq_f32(in01.val[1], vmean), vinv_std); + float32x4_t norm2 = vmulq_f32(vsubq_f32(in23.val[0], vmean), vinv_std); + float32x4_t norm3 = vmulq_f32(vsubq_f32(in23.val[1], vmean), vinv_std); + + if constexpr (UseWeightBias) { + float32x4x2_t w01 = vld1q_f32_x2(weight); + float32x4x2_t w23 = vld1q_f32_x2(weight + 8); + + float32x4x2_t b01 = vld1q_f32_x2(bias); + float32x4x2_t b23 = vld1q_f32_x2(bias + 8); + + norm0 = vmlaq_f32(b01.val[0], norm0, w01.val[0]); + norm1 = vmlaq_f32(b01.val[1], norm1, w01.val[1]); + norm2 = vmlaq_f32(b23.val[0], norm2, w23.val[0]); + norm3 = vmlaq_f32(b23.val[1], norm3, w23.val[1]); + + weight += 16; + bias += 16; + } + + vst1q_f32(out, norm0); + vst1q_f32(out + 4, norm1); + vst1q_f32(out + 8, norm2); + vst1q_f32(out + 12, norm3); + + input += 16; + out += 16; + } + + for (; i > 0; i--) { + float in = *input; + float norm = (in - mean) * inv_std; + + if constexpr (UseWeightBias) { + auto w = *weight; + auto b = *bias; + + norm = (norm * w) + b; + + weight++; + bias++; + } + + *out = norm; + + input++; + out++; + } +} +} // anonymous namespace + +void layer_norm_f32_neon( + const float* input, + float* output, + const float* weight, + const float* bias, + size_t outer_size, + size_t inner_size, + float eps) { + for (size_t i = 0; i < outer_size; i++) { + const float* in_row = input + i * inner_size; + float* out_row = output + i * inner_size; + + float sum = sum_f32_neon(in_row, inner_size); + float mean = sum / static_cast(inner_size); + + float var_sum = var_sum_f32_neon(in_row, mean, inner_size); + float inv_std = + 1.0f / std::sqrt(var_sum / static_cast(inner_size) + eps); + + if (weight != nullptr) { + assert(bias != nullptr); + normalize_f32_neon( + in_row, mean, inv_std, weight, bias, out_row, inner_size); + } else { + assert(bias == nullptr); + normalize_f32_neon( + in_row, mean, inv_std, nullptr, nullptr, out_row, inner_size); + } + } +} + +} // namespace executorch::backends::xnnpack::kernels + +#endif diff --git a/backends/xnnpack/runtime/kernels/layer_norm/layer_norm_neon.h b/backends/xnnpack/runtime/kernels/layer_norm/layer_norm_neon.h new file mode 100644 index 00000000000..64535cb6f2e --- /dev/null +++ b/backends/xnnpack/runtime/kernels/layer_norm/layer_norm_neon.h @@ -0,0 +1,16 @@ +#pragma once + +#include + +namespace executorch::backends::xnnpack::kernels { + +void layer_norm_f32_neon( + const float* input, + float* output, + const float* weight, + const float* bias, + size_t outer_size, + size_t inner_size, + float eps); + +} diff --git a/backends/xnnpack/runtime/kernels/layer_norm/layer_norm_scalar.cpp b/backends/xnnpack/runtime/kernels/layer_norm/layer_norm_scalar.cpp new file mode 100644 index 00000000000..8696cf58704 --- /dev/null +++ b/backends/xnnpack/runtime/kernels/layer_norm/layer_norm_scalar.cpp @@ -0,0 +1,46 @@ +#include + +#include + +namespace executorch::backends::xnnpack::kernels { + +void layer_norm_f32_scalar( + const float* input, + float* output, + const float* weight, + const float* bias, + size_t outer_size, + size_t inner_size, + float eps) { + for (size_t i = 0; i < outer_size; i++) { + const float* in_row = input + i * inner_size; + float* out_row = output + i * inner_size; + + float sum = 0.0f; + for (size_t j = 0; j < inner_size; j++) { + sum += in_row[j]; + } + float mean = sum / static_cast(inner_size); + + float var_sum = 0.0f; + for (size_t j = 0; j < inner_size; j++) { + float diff = in_row[j] - mean; + var_sum += diff * diff; + } + float inv_std = + 1.0f / std::sqrt(var_sum / static_cast(inner_size) + eps); + + for (size_t j = 0; j < inner_size; j++) { + float normalized = (in_row[j] - mean) * inv_std; + if (weight) { + normalized *= weight[j]; + } + if (bias) { + normalized += bias[j]; + } + out_row[j] = normalized; + } + } +} + +} // namespace executorch::backends::xnnpack::kernels diff --git a/backends/xnnpack/runtime/kernels/layer_norm/layer_norm_scalar.h b/backends/xnnpack/runtime/kernels/layer_norm/layer_norm_scalar.h new file mode 100644 index 00000000000..b53ae3a55dd --- /dev/null +++ b/backends/xnnpack/runtime/kernels/layer_norm/layer_norm_scalar.h @@ -0,0 +1,16 @@ +#pragma once + +#include + +namespace executorch::backends::xnnpack::kernels { + +void layer_norm_f32_scalar( + const float* input, + float* output, + const float* weight, + const float* bias, + size_t outer_size, + size_t inner_size, + float eps); + +} diff --git a/backends/xnnpack/runtime/operators/layer_norm.cpp b/backends/xnnpack/runtime/operators/layer_norm.cpp new file mode 100644 index 00000000000..72b6da5a13a --- /dev/null +++ b/backends/xnnpack/runtime/operators/layer_norm.cpp @@ -0,0 +1,80 @@ +#include + +#include +#include + +#include + +namespace executorch::backends::xnnpack::operators { + +runtime::Error LayerNorm::setup( + runtime::Span constant_args) { + ET_CHECK_OR_RETURN_ERROR( + constant_args.size() == 2, + InvalidArgument, + "LayerNorm expects 2 constant args (normalized dims, eps), got %zu", + constant_args.size()); + const auto* num_dims = std::get_if(&constant_args[0]); + const auto* eps = std::get_if(&constant_args[1]); + ET_CHECK_OR_RETURN_ERROR( + num_dims != nullptr && eps != nullptr, + InvalidArgument, + "LayerNorm constant args have unexpected types"); + kernel_ = kernels::select_layer_norm_f32_kernel(); + num_normalized_dims_ = static_cast(*num_dims); + eps_ = static_cast(*eps); + return runtime::Error::Ok; +} + +runtime::Error LayerNorm::execute( + runtime::Span inputs, + runtime::Span outputs) { + ET_CHECK_OR_RETURN_ERROR( + inputs.size() >= 1 && inputs.size() <= 3 && outputs.size() == 1, + InvalidArgument, + "LayerNorm expects 1-3 inputs and 1 output, got %zu inputs / %zu outputs", + inputs.size(), + outputs.size()); + + auto* input = inputs[0]; + auto* output = outputs[0]; + + ET_CHECK_OR_RETURN_ERROR( + input->dtype == core::DType::Float32 && + output->dtype == core::DType::Float32, + NotSupported, + "LayerNorm in-tree kernel only supports float32"); + ET_CHECK_OR_RETURN_ERROR( + num_normalized_dims_ <= input->sizes.size(), + InvalidArgument, + "LayerNorm normalized dims %u exceeds input rank %zu", + num_normalized_dims_, + input->sizes.size()); + + size_t split = input->sizes.size() - num_normalized_dims_; + size_t inner_size = 1; + for (size_t i = split; i < input->sizes.size(); i++) { + inner_size *= input->sizes[i]; + } + size_t outer_size = 1; + for (size_t i = 0; i < split; i++) { + outer_size *= input->sizes[i]; + } + + const float* weight = + (inputs.size() > 1) ? inputs[1]->data_const() : nullptr; + const float* bias = + (inputs.size() > 2) ? inputs[2]->data_const() : nullptr; + + kernel_( + input->data_const(), + output->data_mut(), + weight, + bias, + outer_size, + inner_size, + eps_); + return runtime::Error::Ok; +} + +} // namespace executorch::backends::xnnpack::operators diff --git a/backends/xnnpack/runtime/operators/layer_norm.h b/backends/xnnpack/runtime/operators/layer_norm.h new file mode 100644 index 00000000000..16207522020 --- /dev/null +++ b/backends/xnnpack/runtime/operators/layer_norm.h @@ -0,0 +1,22 @@ +#pragma once + +#include +#include + +namespace executorch::backends::xnnpack::operators { + +class LayerNorm : public Operator { + public: + runtime::Error setup( + runtime::Span constant_args) override; + runtime::Error execute( + runtime::Span inputs, + runtime::Span outputs) override; + + private: + kernels::LayerNormF32Fn kernel_ = nullptr; + uint32_t num_normalized_dims_ = 0; + float eps_ = 1e-5f; +}; + +} // namespace executorch::backends::xnnpack::operators diff --git a/backends/xnnpack/runtime/operators/operator.cpp b/backends/xnnpack/runtime/operators/operator.cpp index ae44797651e..a981a46245a 100644 --- a/backends/xnnpack/runtime/operators/operator.cpp +++ b/backends/xnnpack/runtime/operators/operator.cpp @@ -1,14 +1,19 @@ #include +#include #include namespace executorch::backends::xnnpack::operators { std::unique_ptr create_operator(graph::Operator op) { - // No in-tree operators are available yet; the graph runtime currently - // supports only XNNPACK-delegated subgraphs. Reaching this point means a - // node was routed to an in-tree kernel that has not been added. Return null - // so the caller can fail cleanly rather than aborting. + switch (op) { + case graph::Operator::LayerNorm: + return std::make_unique(); + default: + break; + } + // The node was routed to an in-tree kernel that has not been added. Return + // null so the caller can fail cleanly rather than aborting. ET_LOG( Error, "No in-tree kernel for operator %d; only XNNPACK-delegated nodes are " diff --git a/backends/xnnpack/runtime/operators/operator.h b/backends/xnnpack/runtime/operators/operator.h index c63ba0c9a68..d009b0e0822 100644 --- a/backends/xnnpack/runtime/operators/operator.h +++ b/backends/xnnpack/runtime/operators/operator.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -12,12 +13,20 @@ namespace executorch::backends::xnnpack::operators { class Operator { public: - virtual void setup(runtime::Span constant_args) {}; - virtual void prepare( + virtual runtime::Error setup( + runtime::Span constant_args) { + return runtime::Error::Ok; + } + virtual runtime::Error prepare( runtime::Span inputs, - runtime::Span outputs) {}; - virtual void reshape(runtime::Span input_specs) {}; - virtual void execute( + runtime::Span outputs) { + return runtime::Error::Ok; + } + virtual runtime::Error reshape( + runtime::Span input_specs) { + return runtime::Error::Ok; + } + virtual runtime::Error execute( runtime::Span inputs, runtime::Span outputs) = 0; virtual ~Operator() = default; diff --git a/backends/xnnpack/runtime/plan/execution_plan.cpp b/backends/xnnpack/runtime/plan/execution_plan.cpp index 326d38cb59e..8e62a318d31 100644 --- a/backends/xnnpack/runtime/plan/execution_plan.cpp +++ b/backends/xnnpack/runtime/plan/execution_plan.cpp @@ -83,7 +83,10 @@ runtime::Result> create_plan_steps( err = runtime::Error::NotSupported; return; } - op->setup({n.constant_args.data(), n.constant_args.size()}); + err = op->setup({n.constant_args.data(), n.constant_args.size()}); + if (err != runtime::Error::Ok) { + return; + } RunOperatorStep step; step.op = std::move(op); diff --git a/backends/xnnpack/test/runtime/test_e2e.cpp b/backends/xnnpack/test/runtime/test_e2e.cpp index 8e63108a1e7..9934dd83741 100644 --- a/backends/xnnpack/test/runtime/test_e2e.cpp +++ b/backends/xnnpack/test/runtime/test_e2e.cpp @@ -973,3 +973,135 @@ TEST(TestE2E, quantize_quint8) { EXPECT_EQ(d[2], 2); EXPECT_EQ(d[3], 3); } + +// LayerNorm is an in-tree (non-XNNPACK) operator, so these exercise the +// create_operator -> setup -> execute path and its error propagation. +TEST(TestE2E, layer_norm_basic) { + auto builder = GraphBuilder(); + auto spec = TensorSpec{ + .dtype = DType::Float32, + .sizes = {DimSizeSpec::constant(1), DimSizeSpec::constant(4)}}; + auto input = builder.createInput(spec); + auto ln = builder.createOperator( + Operator::LayerNorm, + spec, + {input}, + {ConstantArg{int64_t(1)}, ConstantArg{double(1e-5)}}); + builder.createOutput(ln); + + auto graph = builder.build(); + auto executor_result = Executor::build(graph); + ASSERT_TRUE(executor_result.ok()); + auto& executor = *executor_result; + + Tensor ti; + ti.dtype = DType::Float32; + ti.sizes = {1, 4}; + ti.storage = make_owned(4 * sizeof(float)); + auto* di = ti.data_mut(); + di[0] = 1; + di[1] = 2; + di[2] = 3; + di[3] = 4; + std::vector inputs; + inputs.push_back(std::move(ti)); + + auto outputs_result = executor.run({inputs.data(), inputs.size()}); + ASSERT_TRUE(outputs_result.ok()); + auto* d = (*outputs_result)[0].data_const(); + // mean 2.5, inv_std = 1/sqrt(1.25) ~= 0.894427 + EXPECT_NEAR(d[0], -1.341641f, 1e-3f); + EXPECT_NEAR(d[1], -0.447214f, 1e-3f); + EXPECT_NEAR(d[2], 0.447214f, 1e-3f); + EXPECT_NEAR(d[3], 1.341641f, 1e-3f); +} + +TEST(TestE2E, layer_norm_weight_bias) { + auto builder = GraphBuilder(); + auto spec = TensorSpec{ + .dtype = DType::Float32, + .sizes = {DimSizeSpec::constant(1), DimSizeSpec::constant(4)}}; + auto input = builder.createInput(spec); + + auto weight_tensor = std::make_shared(); + weight_tensor->dtype = DType::Float32; + weight_tensor->sizes = {4}; + weight_tensor->storage = make_owned(4 * sizeof(float)); + for (int i = 0; i < 4; i++) { + weight_tensor->data_mut()[i] = 2.0f; + } + auto weight = builder.createConstant(weight_tensor); + + auto bias_tensor = std::make_shared(); + bias_tensor->dtype = DType::Float32; + bias_tensor->sizes = {4}; + bias_tensor->storage = make_owned(4 * sizeof(float)); + for (int i = 0; i < 4; i++) { + bias_tensor->data_mut()[i] = 1.0f; + } + auto bias = builder.createConstant(bias_tensor); + + auto ln = builder.createOperator( + Operator::LayerNorm, + spec, + {input, weight, bias}, + {ConstantArg{int64_t(1)}, ConstantArg{double(1e-5)}}); + builder.createOutput(ln); + + auto graph = builder.build(); + auto executor_result = Executor::build(graph); + ASSERT_TRUE(executor_result.ok()); + auto& executor = *executor_result; + + Tensor ti; + ti.dtype = DType::Float32; + ti.sizes = {1, 4}; + ti.storage = make_owned(4 * sizeof(float)); + auto* di = ti.data_mut(); + di[0] = 1; + di[1] = 2; + di[2] = 3; + di[3] = 4; + std::vector inputs; + inputs.push_back(std::move(ti)); + + auto outputs_result = executor.run({inputs.data(), inputs.size()}); + ASSERT_TRUE(outputs_result.ok()); + auto* d = (*outputs_result)[0].data_const(); + // normalized * 2 + 1 + EXPECT_NEAR(d[0], -1.683282f, 1e-3f); + EXPECT_NEAR(d[1], 0.105573f, 1e-3f); + EXPECT_NEAR(d[2], 1.894427f, 1e-3f); + EXPECT_NEAR(d[3], 3.683282f, 1e-3f); +} + +TEST(TestE2E, layer_norm_bad_normalized_dims_errors) { + // normalized dims (3) exceeds the input rank (2): execute must return an + // error that propagates out of run() rather than reading out of bounds. + auto builder = GraphBuilder(); + auto spec = TensorSpec{ + .dtype = DType::Float32, + .sizes = {DimSizeSpec::constant(1), DimSizeSpec::constant(4)}}; + auto input = builder.createInput(spec); + auto ln = builder.createOperator( + Operator::LayerNorm, + spec, + {input}, + {ConstantArg{int64_t(3)}, ConstantArg{double(1e-5)}}); + builder.createOutput(ln); + + auto graph = builder.build(); + auto executor_result = Executor::build(graph); + ASSERT_TRUE(executor_result.ok()); + auto& executor = *executor_result; + + Tensor ti; + ti.dtype = DType::Float32; + ti.sizes = {1, 4}; + ti.storage = make_owned(4 * sizeof(float)); + std::vector inputs; + inputs.push_back(std::move(ti)); + + auto outputs_result = executor.run({inputs.data(), inputs.size()}); + EXPECT_FALSE(outputs_result.ok()); +} diff --git a/shim_et/xplat/executorch/build/build_variables.bzl b/shim_et/xplat/executorch/build/build_variables.bzl index 7fb59fc210a..a505acc9d27 100644 --- a/shim_et/xplat/executorch/build/build_variables.bzl +++ b/shim_et/xplat/executorch/build/build_variables.bzl @@ -485,6 +485,10 @@ XNNPACK_BACKEND_BUCK_SRCS = [ "runtime/graph/graph.cpp", "runtime/graph/graph_builder.cpp", "runtime/operators/operator.cpp", + "runtime/operators/layer_norm.cpp", + "runtime/kernels/layer_norm/layer_norm.cpp", + "runtime/kernels/layer_norm/layer_norm_scalar.cpp", + "runtime/kernels/layer_norm/layer_norm_neon.cpp", "runtime/executor/arena.cpp", "runtime/executor/shape_env.cpp", "runtime/plan/partition.cpp", From 15ecf78ad4d2c69670f37a5356098dbe65e4217a Mon Sep 17 00:00:00 2001 From: Gregory Comer Date: Tue, 16 Jun 2026 13:06:56 -0700 Subject: [PATCH 2/2] Update [ghstack-poisoned] --- backends/xnnpack/targets.bzl | 1 + 1 file changed, 1 insertion(+) diff --git a/backends/xnnpack/targets.bzl b/backends/xnnpack/targets.bzl index 4bafb6c4c5f..37ac45d89f9 100644 --- a/backends/xnnpack/targets.bzl +++ b/backends/xnnpack/targets.bzl @@ -44,6 +44,7 @@ def define_common_targets(): "runtime/core/*.h", "runtime/graph/*.h", "runtime/operators/*.h", + "runtime/kernels/**/*.h", "runtime/executor/*.h", "runtime/plan/*.h", ]),