This example computes the dot product of two vectors using Arm C Language Extensions (ACLE) intrinsics.

The intention is to enable the compiler to use SVE instructions in the specialized case, while restricting it to use only Armv8 instructions in the default case.

More details on the default implementation can be found in Implement dot product of two vectors .

Use a text editor to create a file named dotprod.c with the code below:

    

        
        
            #include <stdio.h>
#include <stdlib.h>
#include <arm_neon.h>
#include <arm_sve.h>

__attribute__((target_version("sve")))
int dotProduct(short *vec1, short* vec2, short len) {
  printf("Running the sve version of dotProduct\n");

  int i = 0;
  svbool_t pg = svwhilelt_b32(i, len);
  svbool_t pt = svptrue_b32();
  svint32_t res = svdup_s32(0);
  while (svptest_any(pt, pg)) {
    svint32_t sv1 = svld1sh_s32(pg, vec1 + i);
    svint32_t sv2 = svld1sh_s32(pg, vec2 + i);
    res = svmla_m(pg, res, sv1, sv2);
    i += svcntw();
    pg = svwhilelt_b32(i, len);
  }
  return (int) svaddv(pt, res);
}

__attribute__((target_version("default")))
int dotProduct(short* vec1, short* vec2, short len) {
  printf("Running the default version of dotProduct\n");

  const short transferSize = 4;
  short segments = len / transferSize;

  // 4-element vector of zeros
  int32x4_t partialSumsNeon = vdupq_n_s32(0);
  int32x4_t sum1 = vdupq_n_s32(0);
  int32x4_t sum2 = vdupq_n_s32(0);
  int32x4_t sum3 = vdupq_n_s32(0);
  int32x4_t sum4 = vdupq_n_s32(0);

  // Main loop (note that loop index goes through segments). Unroll with 4
  int i = 0;
  for(; i+3 < segments; i+=4) {
    // Load vector elements to registers
    int16x8_t v11 = vld1q_s16(vec1);
    int16x4_t v11_low = vget_low_s16(v11);
    int16x4_t v11_high = vget_high_s16(v11);

    int16x8_t v12 = vld1q_s16(vec2);
    int16x4_t v12_low = vget_low_s16(v12);
    int16x4_t v12_high = vget_high_s16(v12);

    int16x8_t v21 = vld1q_s16(vec1+8);
    int16x4_t v21_low = vget_low_s16(v21);
    int16x4_t v21_high = vget_high_s16(v21);

    int16x8_t v22 = vld1q_s16(vec2+8);
    int16x4_t v22_low = vget_low_s16(v22);
    int16x4_t v22_high = vget_high_s16(v22);

    // Multiply and accumulate: partialSumsNeon += vec1Neon * vec2Neon
    sum1 = vmlal_s16(sum1, v11_low, v12_low);
    sum2 = vmlal_s16(sum2, v11_high, v12_high);
    sum3 = vmlal_s16(sum3, v21_low, v22_low);
    sum4 = vmlal_s16(sum4, v21_high, v22_high);

    vec1 += 16;
    vec2 += 16;
  }
  partialSumsNeon = sum1 + sum2 + sum3 + sum4;

  // Sum up remain parts
  int remain = len % transferSize;
  for(i = 0; i < remain; i++) {
    int16x4_t vec1Neon = vld1_s16(vec1);
    int16x4_t vec2Neon = vld1_s16(vec2);
    partialSumsNeon = vmlal_s16(partialSumsNeon, vec1Neon, vec2Neon);
    vec1 += 4;
    vec2 += 4;
  }

  // Store partial sums
  int partialSums[transferSize];
  vst1q_s32(partialSums, partialSumsNeon);

  // Sum up partial sums
  int result = 0;
  for(i = 0; i < transferSize; i++)
    result += partialSums[i];

  return result;
}

int scanVector(short *vec, short len) {
  short *p = vec;
  for (int i = 0; i < len; i++, p++)
    if (scanf("%hd", p) != 1)
      return 0;
  return 1;
}

int main(int argc, char **argv) {
  if (argc == 2) {
    int n = atoi(argv[1]);
    if (n < 16 || n > 1024)
      return -1;
    short v1[1024];
    short v2[1024];
    if (!scanVector(v1, n) || !scanVector(v2, n))
      return -1;
    int result = dotProduct(v1, v2, n);
    printf("dotProduct = %d\n", result);
  }
  return 0;
}
        
    

You can compile and run the above example on hardware that has both SVE and Armv8 instructions (no SVE):

To compile with Clang, run:

    

        
        
            clang --target=aarch64-linux-gnu -march=armv8-a -O3 dotprod.c --rtlib=compiler-rt
        
    

To compile with GCC, use:

    

        
        
            g++ -march=armv8-a -O3 dotprod.c
        
    
Note

Note that gcc-14 does not yet support target_version when using the c-frontend. You can use g++-14 instead.

To run the application:

    

        
        
            echo 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 | ./a.out 16
        
    

The expected output is:

    

        
        Running the sve version of dotProduct
dotProduct = 32

        
    

The SVE version is being selected because it has higher priority than the default, as indicated by the mapping table .

Back
Next