Tensorium
Loading...
Searching...
No Matches
Derivate.hpp
Go to the documentation of this file.
1#pragma once
2
4#include "../SIMD/CPU_id.hpp"
5#include "../SIMD/SIMD.hpp"
6#include "Matrix.hpp"
7#include "Tensor.hpp"
8#include "Vector.hpp"
9#include <cassert>
10#include <cmath>
11#include <immintrin.h>
12#include <iostream>
13#include <numeric>
14#include <vector>
15
16/* ************************************************************************** */
22/* ************************************************************************** */
23
24namespace tensorium {
30template <typename K> class Derivate {
31 public:
32 size_t rows, cols;
34 size_t block_size;
41 Derivate(size_t r, size_t c)
42 : rows(r),
43 cols(c),
44 data(r * c, K()),
46 std::cout << "Auto-selected BLOCK_SIZE = " << block_size << std::endl;
47 }
54 : rows(m.rows),
55 cols(m.cols),
56 data(m.data),
65 K &operator()(size_t i, size_t j) {
66 if (i >= rows || j >= cols) {
67 std::cerr << "[OOB] operator(): "
68 << "i=" << i << " rows=" << rows << " | j=" << j << " cols=" << cols << "\n";
69 }
70 assert(i < rows && j < cols);
71 return data[i * cols + j];
72 }
80 const K &operator()(size_t i, size_t j) const {
81 assert(i < rows && j < cols && "Derivate::operator() const: indice hors bornes");
82 return data[i * cols + j];
83 }
89 size_t size() const { return rows * cols; }
98 __attribute__((always_inline, hot, flatten)) inline void
99 centered_derivative(const Derivate<K> &input, Derivate<K> &output, size_t axis, K dx) const {
101 using reg = typename Simd::reg;
102 const size_t simd_width = Simd::width;
103 const K inv_2dx = K(1) / (K(2) * dx);
104
105 if (axis == 1) {
106#pragma omp parallel for
107 for (size_t i = 0; i < input.rows; ++i) {
108 output(i, 0) = K(0);
109
110 size_t j = 1;
111 const size_t simd_end =
112 (input.cols > simd_width + 1) ? (input.cols - simd_width - 1) : 1;
113 for (; j < simd_end; j += simd_width) {
114 if (j + simd_width > input.cols - 1)
115 break;
116
117 const K *left_ptr = &input(i, j - 1);
118 const K *right_ptr = &input(i, j + 1);
119 K *out_ptr = &output(i, j);
120
121 reg left = Simd::loadu(left_ptr);
122 reg right = Simd::loadu(right_ptr);
123 reg diff = Simd::sub(right, left);
124 reg res = Simd::mul(diff, Simd::set1(inv_2dx));
125 Simd::storeu(out_ptr, res);
126 }
127
128 for (; j < input.cols - 1; ++j)
129 output(i, j) = (input(i, j + 1) - input(i, j - 1)) * inv_2dx;
130
131 output(i, input.cols - 1) = K(0);
132 }
133 }
134
135 else if (axis == 0) {
136#pragma omp parallel for
137 for (size_t i = 1; i < input.rows - 1; ++i)
138 for (size_t j = 0; j < input.cols; ++j)
139 output(i, j) = (input(i + 1, j) - input(i - 1, j)) * inv_2dx;
140
141 for (size_t j = 0; j < input.cols; ++j) {
142 output(0, j) = K(0);
143 output(input.rows - 1, j) = K(0);
144 }
145 }
146
147 else {
148 std::cerr << "[centered_derivative] Invalid axis: must be 0 or 1\n";
149 }
150 }
159 __attribute__((always_inline, hot, flatten)) inline void
162 using reg = typename Simd::reg;
163 constexpr size_t W = Simd::width;
164
165 const K inv_12dx = K(1) / (K(12) * dx);
166 const reg inv = Simd::set1(inv_12dx);
167 const reg eight = Simd::set1(8.0f);
168
169 if (axis == 1) {
170#pragma omp parallel for
171 for (size_t i = 0; i < input.rows; ++i) {
172 output(i, 0) = K(0);
173 output(i, 1) = K(0);
174
175 size_t j = 2;
176 for (; j + W <= input.cols - 2; j += W) {
177 const K *ptr_m2 = &input(i, j - 2);
178 const K *ptr_m1 = &input(i, j - 1);
179 const K *ptr_p1 = &input(i, j + 1);
180 const K *ptr_p2 = &input(i, j + 2);
181
182 reg fm2 = Simd::loadu(ptr_m2);
183 reg fm1 = Simd::loadu(ptr_m1);
184 reg fp1 = Simd::loadu(ptr_p1);
185 reg fp2 = Simd::loadu(ptr_p2);
186
187 reg term1 = Simd::mul(fp1, eight);
188 reg term2 = Simd::mul(fp2, Simd::set1(1.0f));
189 reg term3 = Simd::mul(fm1, eight);
190
191 reg num = Simd::sub(Simd::add(fm2, term1), term2);
192 num = Simd::sub(num, term3);
193 reg res = Simd::mul(num, inv);
194
195 Simd::storeu(&output(i, j), res);
196 }
197
198 for (; j < input.cols - 2; ++j) {
199 output(i, j) = (input(i, j - 2) - 8 * input(i, j - 1) + 8 * input(i, j + 1) -
200 input(i, j + 2)) *
201 inv_12dx;
202 }
203
204 output(i, input.cols - 2) = K(0);
205 output(i, input.cols - 1) = K(0);
206 }
207 } else if (axis == 0) {
208#pragma omp parallel for
209 for (size_t j = 0; j < input.cols; ++j) {
210 output(0, j) = K(0);
211 output(1, j) = K(0);
212
213 size_t i = 2;
214 for (; i + W <= input.rows - 2; i += W) {
215 const K *ptr_m2 = &input(i - 2, j);
216 const K *ptr_m1 = &input(i - 1, j);
217 const K *ptr_p1 = &input(i + 1, j);
218 const K *ptr_p2 = &input(i + 2, j);
219
220 reg fm2 = Simd::loadu(ptr_m2);
221 reg fm1 = Simd::loadu(ptr_m1);
222 reg fp1 = Simd::loadu(ptr_p1);
223 reg fp2 = Simd::loadu(ptr_p2);
224
225 reg term1 = Simd::mul(fp1, eight);
226 reg term2 = Simd::mul(fp2, Simd::set1(1.0f));
227 reg term3 = Simd::mul(fm1, eight);
228
229 reg num = Simd::sub(Simd::add(fm2, term1), term2);
230 num = Simd::sub(num, term3);
231 reg res = Simd::mul(num, inv);
232
233 Simd::storeu(&output(i, j), res);
234 }
235
236 for (; i < input.rows - 2; ++i) {
237 output(i, j) = (input(i - 2, j) - 8 * input(i - 1, j) + 8 * input(i + 1, j) -
238 input(i + 2, j)) *
239 inv_12dx;
240 }
241
242 output(input.rows - 2, j) = K(0);
243 output(input.rows - 1, j) = K(0);
244 }
245 } else {
246 std::cerr << "[SIMD Order4] Invalid axis: must be 0 or 1\n";
247 }
248 }
249};
250
257template <typename K, size_t Rank> class DerivateND {
258 public:
259 std::array<size_t, Rank> shape;
267 DerivateND(const std::array<size_t, Rank> &dims)
268 : shape(dims),
269 data(std::accumulate(dims.begin(), dims.end(), size_t(1), std::multiplies<size_t>()),
270 K()),
272 std::cout << "Auto-selected BLOCK_SIZE = " << block_size << std::endl;
273 }
274
275 inline size_t flatten_index(const std::array<size_t, Rank> &indices) const {
276 size_t index = 0, stride = 1;
277 for (int i = Rank - 1; i >= 0; --i) {
278 index += indices[i] * stride;
279 stride *= shape[i];
280 }
281 return index;
282 }
289 inline K &operator()(const std::array<size_t, Rank> &indices) {
290 return data[flatten_index(indices)];
291 }
298 inline const K &operator()(const std::array<size_t, Rank> &indices) const {
299 return data[flatten_index(indices)];
300 }
306 inline size_t size() const { return data.size(); }
315 __attribute__((always_inline, hot, flatten)) inline void
317 K dx) const {
319 using reg = typename Simd::reg;
320 const size_t simd_width = Simd::width;
321
322 const auto &shape = input.shape;
323 const size_t total = input.size();
324 const K inv_2dx = K(1) / (K(2) * dx);
325 const reg inv2dx = Simd::set1(inv_2dx);
326
327 std::array<size_t, Rank> strides;
328 strides[Rank - 1] = 1;
329 for (int i = Rank - 2; i >= 0; --i)
330 strides[i] = strides[i + 1] * shape[i + 1];
331
332 const size_t stride_axis = strides[axis];
333 const size_t dim_axis = shape[axis];
334
335#pragma omp parallel for schedule(static)
336 for (size_t flat = 0; flat < total; flat += simd_width) {
337 bool safe = true;
338
339 for (size_t offset = 0; offset < simd_width; ++offset) {
340 if (flat + offset >= total) {
341 safe = false;
342 break;
343 }
344
345 const size_t coord_axis = ((flat + offset) / stride_axis) % dim_axis;
346 if (coord_axis == 0 || coord_axis >= dim_axis - 1) {
347 safe = false;
348 break;
349 }
350 }
351
352 if (!safe) {
353 for (size_t offset = 0; offset < simd_width && flat + offset < total; ++offset) {
354 const size_t f = flat + offset;
355 const size_t coord_axis = (f / stride_axis) % dim_axis;
356
357 if (coord_axis == 0 || coord_axis >= dim_axis - 1) {
358 output.data[f] = K(0);
359 continue;
360 }
361 output.data[f] =
362 (input.data[f + stride_axis] - input.data[f - stride_axis]) * inv_2dx;
363 }
364 continue;
365 }
366
367 const K *ptr_fwd = input.data.data() + flat + stride_axis;
368 const K *ptr_back = input.data.data() + flat - stride_axis;
369 K *out_ptr = output.data.data() + flat;
370
371 reg forward = Simd::loadu(ptr_fwd);
372 reg backward = Simd::loadu(ptr_back);
373 reg diff = Simd::sub(forward, backward);
374 reg result = Simd::mul(diff, inv2dx);
375 Simd::storeu(out_ptr, result);
376 }
377 }
386 __attribute__((always_inline, hot, flatten)) inline void
387 centered_derivative_order4_rank(const DerivateND<K, Rank> &input, DerivateND<K, Rank> &output,
388 size_t axis, K dx) const {
390 using reg = typename Simd::reg;
391 constexpr size_t W = Simd::width;
392
393 const auto &shape = input.shape;
394 const size_t total = input.size();
395 const K inv_12dx = K(1) / (K(12) * dx);
396 const reg inv = Simd::set1(inv_12dx);
397
398 std::array<size_t, Rank> strides;
399 strides[Rank - 1] = 1;
400 for (int i = Rank - 2; i >= 0; --i)
401 strides[i] = strides[i + 1] * shape[i + 1];
402
403 const size_t stride_axis = strides[axis];
404 const size_t dim_axis = shape[axis];
405
406#pragma omp parallel for schedule(static)
407 for (size_t flat = 0; flat < total; flat += W) {
408 bool safe = true;
409
410 for (size_t offset = 0; offset < W; ++offset) {
411 if (flat + offset >= total) {
412 safe = false;
413 break;
414 }
415
416 const size_t coord_axis = ((flat + offset) / stride_axis) % dim_axis;
417 if (coord_axis < 2 || coord_axis >= dim_axis - 2) {
418 safe = false;
419 break;
420 }
421 }
422
423 if (!safe) {
424 for (size_t offset = 0; offset < W && flat + offset < total; ++offset) {
425 size_t f = flat + offset;
426 size_t coord_axis = (f / stride_axis) % dim_axis;
427 if (coord_axis < 2 || coord_axis >= dim_axis - 2) {
428 output.data[f] = K(0);
429 continue;
430 }
431
432 K fm2 = input.data[f - 2 * stride_axis];
433 K fm1 = input.data[f - stride_axis];
434 K fp1 = input.data[f + stride_axis];
435 K fp2 = input.data[f + 2 * stride_axis];
436
437 output.data[f] = (-fp2 + 8 * fp1 - 8 * fm1 + fm2) * inv_12dx;
438 }
439 continue;
440 }
441
442 const K *ptr_m2 = input.data.data() + flat - 2 * stride_axis;
443 const K *ptr_m1 = input.data.data() + flat - stride_axis;
444 const K *ptr_p1 = input.data.data() + flat + stride_axis;
445 const K *ptr_p2 = input.data.data() + flat + 2 * stride_axis;
446 K *out_ptr = output.data.data() + flat;
447
448 reg fm2 = Simd::loadu(ptr_m2);
449 reg fm1 = Simd::loadu(ptr_m1);
450 reg fp1 = Simd::loadu(ptr_p1);
451 reg fp2 = Simd::loadu(ptr_p2);
452
453 reg num = Simd::add(
454 fm2, Simd::sub(Simd::mul(fp1, Simd::set1(8)), Simd::mul(fp2, Simd::set1(1))));
455 num = Simd::sub(num, Simd::mul(fm1, Simd::set1(8)));
456 reg res = Simd::mul(num, inv);
457
458 Simd::storeu(out_ptr, res);
459 }
460 }
461};
473template <typename Container>
474inline Container richardson_derivative_container(const Container &plus_h, const Container &minus_h,
475 const Container &plus_half_h,
476 const Container &minus_half_h, double h) {
477 assert(plus_h.size() == minus_h.size());
478 assert(plus_h.size() == plus_half_h.size());
479 assert(plus_h.size() == minus_half_h.size());
480
481 Container out(plus_h);
482#pragma omp parallel for
483 for (size_t i = 0; i < plus_h.size(); ++i) {
484 auto diff_h = (plus_h[i] - minus_h[i]) / (2.0 * h);
485 auto diff_half = (plus_half_h[i] - minus_half_h[i]) / h;
486 out[i] = (4.0 * diff_half - diff_h) / 3.0;
487 }
488 return out;
489}
501template <typename T>
502inline T richardson_derivative(const T &plus_h, const T &minus_h, const T &plus_half_h,
503 const T &minus_half_h, double h) {
504 T diff_h = (plus_h - minus_h) / (2.0 * h);
505 T diff_half = (plus_half_h - minus_half_h) / h;
506 return (4.0 * diff_half - diff_h) / 3.0;
507}
508
509} // namespace tensorium
std::vector< K, AlignedAllocator< K, ALIGN > > aligned_vector
Type alias for a std::vector with aligned memory allocation.
Definition Allocator.hpp:111
size_t detect_optimal_block_size()
Definition CPU_id.hpp:18
A multi-dimensional aligned tensor for numerical derivatives.
Definition Derivate.hpp:257
__attribute__((always_inline, hot, flatten)) inline void centered_derivative(const DerivateND< K
Compute second-order centered derivative along an axis.
size_t block_size
Definition Derivate.hpp:261
DerivateND(const std::array< size_t, Rank > &dims)
Construct a tensor with a given shape.
Definition Derivate.hpp:267
std::array< size_t, Rank > strides
Definition Derivate.hpp:327
const size_t dim_axis
Definition Derivate.hpp:333
__attribute__((always_inline, hot, flatten)) inline void centered_derivative_order4_rank(const DerivateND< K
Compute fourth-order centered derivative along an axis.
size_t size() const
Total number of elements in the tensor.
Definition Derivate.hpp:306
const K inv_12dx
Definition Derivate.hpp:395
const size_t total
Definition Derivate.hpp:323
const K & operator()(const std::array< size_t, Rank > &indices) const
Const access to a tensor element.
Definition Derivate.hpp:298
const reg inv2dx
Definition Derivate.hpp:325
const size_t stride_axis
Definition Derivate.hpp:332
const size_t simd_width
Definition Derivate.hpp:320
aligned_vector< K > data
Definition Derivate.hpp:260
Rank DerivateND< K, Rank > & output
Definition Derivate.hpp:316
Rank & input
Definition Derivate.hpp:316
Rank DerivateND< K, Rank > size_t axis
Definition Derivate.hpp:316
K & operator()(const std::array< size_t, Rank > &indices)
Mutable access to a tensor element.
Definition Derivate.hpp:289
const K inv_2dx
Definition Derivate.hpp:324
size_t flatten_index(const std::array< size_t, Rank > &indices) const
Definition Derivate.hpp:275
std::array< size_t, Rank > shape
Definition Derivate.hpp:259
const reg inv
Definition Derivate.hpp:396
constexpr size_t W
Definition Derivate.hpp:391
typename Simd::reg reg
Definition Derivate.hpp:319
A 2D aligned matrix for numerical derivatives.
Definition Derivate.hpp:30
typename Simd::reg reg
Definition Derivate.hpp:101
size_t block_size
Definition Derivate.hpp:34
const K inv_12dx
Definition Derivate.hpp:165
const size_t simd_width
Definition Derivate.hpp:102
size_t size() const
Total number of elements in the matrix.
Definition Derivate.hpp:89
constexpr size_t W
Definition Derivate.hpp:163
Derivate< K > size_t axis
Definition Derivate.hpp:99
Derivate(size_t r, size_t c)
Constructor with explicit dimensions.
Definition Derivate.hpp:41
K & operator()(size_t i, size_t j)
Mutable access to an element.
Definition Derivate.hpp:65
size_t rows
Definition Derivate.hpp:32
Derivate< K > size_t K dx
Definition Derivate.hpp:160
Derivate< K > & output
Definition Derivate.hpp:99
size_t cols
Definition Derivate.hpp:32
const K inv_2dx
Definition Derivate.hpp:103
__attribute__((always_inline, hot, flatten)) inline void centered_derivative(const Derivate< K > &input
Compute second-order centered derivative.
const reg eight
Definition Derivate.hpp:167
const K & operator()(size_t i, size_t j) const
Const access to an element.
Definition Derivate.hpp:80
Derivate(const Matrix< K > &m)
Construct from an existing matrix.
Definition Derivate.hpp:53
const reg inv
Definition Derivate.hpp:166
__attribute__((always_inline, hot, flatten)) inline void centered_derivative_order4(const Derivate< K > &input
Compute fourth-order centered derivative.
aligned_vector< K > data
Definition Derivate.hpp:33
High-performance aligned matrix class with SIMD support.
Definition Matrix.hpp:27
Definition Derivate.hpp:24
Container richardson_derivative_container(const Container &plus_h, const Container &minus_h, const Container &plus_half_h, const Container &minus_half_h, double h)
Richardson extrapolation for vectors or containers.
Definition Derivate.hpp:474
T richardson_derivative(const T &plus_h, const T &minus_h, const T &plus_half_h, const T &minus_half_h, double h)
Richardson extrapolation for scalar values.
Definition Derivate.hpp:502
void centered_derivative(const Derivate< K > &input, Derivate< K > &output, size_t axis, K dx)
Definition Functional.hpp:349
void centered_derivative_order4(const Derivate< K > &input, Derivate< K > &output, size_t axis, K dx)
Definition Functional.hpp:355
Definition SIMD.hpp:177