###### Optimize SIMD code with vectorization-friendly data layout

As you have learned, changing data layout can improve performance.

The challenge is knowing what to change and understanding why it would matter. Another version of the program is shown below. You can study this version to learn about data layout.

Use a text editor to copy the code below and save it as `simulation4.c`:

```c

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <sys/time.h>
#include <unistd.h>

#include <arm_neon.h>

#define N           100000 // number of particles
#define SECONDS     100    // duration in seconds
#define STEPSPERSEC 1000   // steps per second or time resolution

// The object struct
typedef struct object {
uint32_t id;
float weight;
void *model_data;
} object_t;

// Struct-of-Arrays (SoA) layout: each coordinate/velocity component is a
// contiguous array of N floats, so vld1q_f32/vst1q_f32 can load and store
// 4 consecutive objects per axis with no gaps or wasted vector lanes.
struct object_list {
object_t o[N];   // cold per-object metadata, kept out of the SIMD arrays
float x[N];      // positions, one array per axis
float y[N];
float z[N];
float vx[N];     // velocities, one array per axis
float vy[N];
float vz[N];
};

// Helper function to return a random float number from 0.0f to a.
// Return a uniform pseudo-random float in the closed range [0.0f, a],
// driven by the C library rand() stream (seedable via srand()).
float randf(float a) {
    float unit = (float)rand() / (float)RAND_MAX; // scale rand() into [0, 1]
    return unit * a;
}

// Upper bound of the simulation box, replicated across all 4 vector lanes so
// it can be loaded directly with vld1q_f32; the lower bound is its negation.
const float box_hi[4] = {  10.0f,  10.0f,  10.0f,  10.0f };

// Initialize all N objects with random state:
//   positions uniformly in [-box_hi, box_hi] per axis,
//   velocities uniformly in [-1, 1] per axis.
// Uses the rand() stream via randf(); call srand() first for reproducibility.
void init_objects(struct object_list *objects) {
    for (size_t i = 0; i < N; i++) {
        objects->o[i].id = i;
        objects->o[i].weight = randf(2.0f);
        // randf(2*hi) - hi maps [0, 2*hi] onto [-hi, +hi]
        objects->x[i] = randf(2.0f * box_hi[0]) - box_hi[0];
        objects->y[i] = randf(2.0f * box_hi[1]) - box_hi[1];
        objects->z[i] = randf(2.0f * box_hi[2]) - box_hi[2];
        // float literals (2.0f) avoid a needless float->double->float round trip
        objects->vx[i] = randf(2.0f) - 1.0f;
        objects->vy[i] = randf(2.0f) - 1.0f;
        objects->vz[i] = randf(2.0f) - 1.0f;
    }
}

// Advance all N objects for `duration` seconds in increments of `step`,
// bouncing them elastically off the walls of the box [-box_hi, box_hi] and
// counting wall collisions per axis. Processes 4 objects per loop iteration
// using NEON; requires N to be a multiple of 4.
void simulate_objects(struct object_list *objects, float duration, float step) {

    float current_time = 0.0f;

    float32x4_t s = vdupq_n_f32(step);
    uint32x4_t one = vdupq_n_u32(1);
    // Collision counters are per axis, accumulated in 4 lanes each.
    uint32x4_t ctr_x = vdupq_n_u32(0);
    uint32x4_t ctr_y = vdupq_n_u32(0);
    uint32x4_t ctr_z = vdupq_n_u32(0);

    float32x4_t box_hi_v = vld1q_f32(box_hi);
    float32x4_t box_lo_v = vnegq_f32(box_hi_v); // box is symmetric about 0

    while (current_time < duration) {
        // Move 4 objects in each iteration (SoA layout keeps lanes contiguous)
        for (size_t i = 0; i < N; i += 4) {
            // Load the x,y,z and vx,vy,vz components of 4 objects
            float32x4_t x_v  = vld1q_f32(&objects->x[i]);
            float32x4_t y_v  = vld1q_f32(&objects->y[i]);
            float32x4_t z_v  = vld1q_f32(&objects->z[i]);
            float32x4_t vx_v = vld1q_f32(&objects->vx[i]);
            float32x4_t vy_v = vld1q_f32(&objects->vy[i]);
            float32x4_t vz_v = vld1q_f32(&objects->vz[i]);

            // Advance positions: pos += vel * step (fused multiply-accumulate)
            x_v = vmlaq_f32(x_v, vx_v, s);
            y_v = vmlaq_f32(y_v, vy_v, s);
            z_v = vmlaq_f32(z_v, vz_v, s);
            vst1q_f32(&objects->x[i], x_v);
            vst1q_f32(&objects->y[i], y_v);
            vst1q_f32(&objects->z[i], z_v);

            // Collision mask per axis: lane is all-ones when the object left
            // the box on either side (pos > hi OR pos < lo)
            uint32x4_t coll_x = vorrq_u32(vcgtq_f32(x_v, box_hi_v),
                                          vcltq_f32(x_v, box_lo_v));
            uint32x4_t coll_y = vorrq_u32(vcgtq_f32(y_v, box_hi_v),
                                          vcltq_f32(y_v, box_lo_v));
            uint32x4_t coll_z = vorrq_u32(vcgtq_f32(z_v, box_hi_v),
                                          vcltq_f32(z_v, box_lo_v));

            // Negated velocities for the 4 objects on each axis
            float32x4_t neg_vx_v = vnegq_f32(vx_v);
            float32x4_t neg_vy_v = vnegq_f32(vy_v);
            float32x4_t neg_vz_v = vnegq_f32(vz_v);
            // Reflect velocity only in lanes where a collision occurred
            vx_v = vbslq_f32(coll_x, neg_vx_v, vx_v);
            vy_v = vbslq_f32(coll_y, neg_vy_v, vy_v);
            vz_v = vbslq_f32(coll_z, neg_vz_v, vz_v);
            vst1q_f32(&objects->vx[i], vx_v);
            vst1q_f32(&objects->vy[i], vy_v);
            vst1q_f32(&objects->vz[i], vz_v);

            // Count collisions: AND the all-ones mask with 1, add per lane
            ctr_x = vaddq_u32(ctr_x, vandq_u32(coll_x, one));
            ctr_y = vaddq_u32(ctr_y, vandq_u32(coll_y, one));
            ctr_z = vaddq_u32(ctr_z, vandq_u32(coll_z, one));
        }
        current_time += step;
    }

    // Each axis counter holds 4 partial sums; horizontally add them to get
    // the scalar totals (vaddvq_u32 is AArch64 across-vector addition).
    uint32_t collisions[3];
    collisions[0] = vaddvq_u32(ctr_x);
    collisions[1] = vaddvq_u32(ctr_y);
    collisions[2] = vaddvq_u32(ctr_z);
    // %u matches the unsigned counters (%d invokes UB for uint32_t on
    // platforms where the values exceed INT_MAX)
    printf("Total border collisions: x: %u, y: %u, z: %u\n",
           collisions[0], collisions[1], collisions[2]);
}

int main() {
struct object_list objects;

init_objects(&objects);

const float duration = SECONDS;
const float step = 1.0f/STEPSPERSEC;
struct timeval th_time_start, th_time_end;

gettimeofday(&th_time_start, NULL);
simulate_objects(&objects, duration, step);
gettimeofday(&th_time_end, NULL);

double elapsed;
elapsed = (th_time_end.tv_sec - th_time_start.tv_sec); // sec
elapsed += (th_time_end.tv_usec - th_time_start.tv_usec) / 1000000.0; // us to sec

printf("elapsed time: %f\n", elapsed);
}

```

Compile and run the new program as before:

```bash

gcc -O3 -Wall simulation4.c -o simulation4
./simulation4

```

The output printed is:

```output

Total border collisions: x: 250123, y: 249711, z: 249844
elapsed time: 16.655471

```

For comparison, `clang-15` produces this output:

```output

Total border collisions: x: 250123, y: 249711, z: 249844
elapsed time: 13.530677

```

The first observation is that the number of collisions reported is the same (which is a good thing, performance optimization is useless if you get the wrong results)!

You also see a significant speed gain: the new version is up to 2.25x faster than the original version and about 1.43x faster (for GCC 12) than the previous version — even faster than the hand-written version in the previous section.

Apart from small differences between the compilers, the important difference is due to the change in the data layout.

Review what has been changed.

First, you will notice that the coordinates `x`,`y`,`z` are not part of the original `object` structure anymore, similarly for the velocity values `vx`,`vy`, `vz`.

Instead, there is now an array of `N` elements for each of those variables.

This means that you can now process 4 consecutive `x` elements in each iteration, 4 `y` elements, 4 `z` elements. This results in processing 4 objects per iteration, thereby reducing the iterations to `N/4`.

There is no single `| x | y | z | (unused) |` vector that wastes 25% of storage per instruction.

This example demonstrates two paradigms used in data layout. The previous sections use Array of Structs (AoS). This section switches to Struct of Arrays (SoA).

The second one is generally more performant in SIMD operations.

A few points to consider:

• There is no waste of storage compared to the previous implementations (only 3 of the 4 float elements were used for the 3D coordinates x, y, z)
• About 1/4 of the original `N` iterations are executed, which means 1/4 of the branches
• Many more calculations are done per iteration, which is good as it keeps the CPU pipeline full

Here are some rules for optimal performance with SIMD/vectorized code:

• Prefer fewer iterations with more calculations per iteration (keep the pipeline full with fewer branches)
• Data should be consecutive, prefer struct of arrays instead of arrays of structs
• Try to keep the data as packed as possible, no wasted elements in the vectors

So far you have been using Neon/ASIMD instructions, but newer Arm processors also offer the Scalable Vector Extension (SVE).

Proceed to the next section to find out how to use SVE and compare the performance with NEON.

Back
Next