For anyone taking the OS exam this wednesday, here are some naive vs. SIMD (AVX) function examples which we wrote during a revision session.

Also, the function used to solve the minimum of each group of eight using SIMD then using a naive approach to collate the sub-results. This has now been changed so that results for each group are copied into a float8 buffer as they are computed. The function then loops back on itself to reduce these values in turn until there is only one value remaining.

The only issue with the summing function is that for large length arrays so much copying and combining of the data occurs that floating-point precision errors become apparent sooner than with the naive O(n) function.

```
#include <iostream>
#include <intrin.h>
#include <cmath>
#include <ctime>
#include <random>
#include "Chrono.h"
// An 8 wide float
#define float8 __m256
// Set 1 value to all 8 lanes
#define set1(a) _mm256_set1_ps((a))
// Add
#define add8(a,b) _mm256_add_ps((a),(b))
// Min
#define min8(a,b) _mm256_min_ps((a),(b))
// Shuffles
#define permute8(a,b,c) _mm256_permute2f128_ps((a),(b),(c))
#define blend8(a,b,c) _mm256_blend_ps((a),(b),(c))
#define shuffle8(a,b,c) _mm256_shuffle_ps((a),(b),(c))
// Length of test arrays 2^22
#define LEN 4194304
// Padding for aligned arrays
#define BUFFER 8
using namespace std;
/* HELPERS */
// For a given pointer, return the 32 bit aligned version
float* alignMem(float *ptr) {
float *aptr = ptr;
int align = (((unsigned long long) ptr) & 31) / 4;
aptr += 8 - align;
return aptr;
};
// Make an aligned array of the given length.
// The randomize parameter fills the array with
// random values.
float* makeArray(int len, bool randomize = true) {
float *list = new float[len + BUFFER];
float *alist = alignMem(list);
for (int i = 0; i < len && randomize; i++) {
alist[i] = (float) (rand() % 1000) / 1000.0f;
}
return alist;
};
// Prints a float array to the console
void printArray(float *list, int len) {
cout << "[ ";
for (int i = 0; i < len; i++) {
cout << list[i] << " ";
}
cout << " ]" << endl;
};
/* EXPERIMENTS */
// Take two float arrays, a & b, and sum them into a third array, r
void addVal(float *list1, float *list2, float *res, int len) {
for (int i = 0; i < len; i++) {
res[i] = list1[i] + list2[i];
}
};
// Same as above, but using AVX to chunk the workload by 8
void addValAVX(float *list1, float *list2, float *res, int len) {
for (int i = 0; i < len; i += 8) {
// Get address of start of each float8 group
float8 *a = (float8*) &list1[i];
float8 *b = (float8*) &list2[i];
float8 *r = (float8*) &res[i];
// Sum the groups of 8 into the result group
*r = add8(*a, *b);
}
};
// Take two float array and interleave their values so the result
// array contains even indexed elements from a, and odd indexed elements from b
void blendVal(float *list1, float *list2, float *res, int len) {
for (int i = 0; i < len; i++) {
res[i] = (i % 2 == 0) ? list1[i] : list2[i];
}
};
// Same as above, but using AVX to chunk the workload by groups of 8,
// and by using the 'blend' command to choose each element without an
// 'if-else' condition
void blendValAVX(float *list1, float *list2, float *res, int len) {
// Mask. A 0 means from list1, a 1 means from list 2
// You have to convert your binary mask to hex to compile it!
const int mask = 0xAA; // 0b10101010
for (int i = 0; i < len; i += 8) {
// Get address of start of each float8 group
float8 *a = (float8*) &list1[i];
float8 *b = (float8*) &list2[i];
float8 *r = (float8*) &res[i];
// Select from a & b using the mask
*r = blend8(*a, *b, mask);
}
};
// Like above, take two float arrays, but this time take the first two values
// from list a, then the second two from list b, then the third two from list a,
// and so on...
void shuffleVal(float *list1, float *list2, float *res, int len) {
for (int i = 0; i < len; i++) {
// This results in a stream of true/false
// TT FF TT FF TT FF ... repeating
if ((i / 2) % 2 == 0) {
res[i] = list1[i];
}
else
{
res[i] = list2[i];
}
}
};
// Does the above shuffle using AVX to grab each pair of numbers
void shuffleValAVX(float *list1, float *list2, float *res, int len) {
// Mask. Grab the first and second elements from a,
// then the third and fourth elements from b
const int mask = 0xE4; // 0b 11 10 01 00
for (int i = 0; i < len; i += 8) {
// Get address of start of each float8 group
float8 *a = (float8*) &list1[i];
float8 *b = (float8*) &list2[i];
float8 *r = (float8*) &res[i];
// Select from a and b using the mask
*r = shuffle8(*a, *b, mask);
}
};
// For an array of floats, compute the minumum value
float minVal(float *list1, int len) {
// Start with the highest number possibly in the list
float m = (float) RAND_MAX;
for (int i = 0; i < len; i++) {
// For each value check if it is less than the
// current minimum, and replace it if it is
if (m > list1[i]) m = list1[i];
}
return m;
};
// Find the minimum value in the array, this time using AVX
// to chunk the workload into groups of 8 which are solved
// in parallel
float minValAVX(float *list, int len) {
const int shufflemask = 0xB1; // 0b 10 11 00 01
const int shufflemask2 = 0x4E; // 0b 01 00 11 10
const int permutemask = 0x21; // 0b 00 10 00 01
// Creat buffer for results
int sublen = len / 8;
float *sub = new float[sublen + BUFFER];
float *sublist = alignMem(sub);
float8 res, b;
float *res0 = (float*) &res;
bool reduce = true;
do {
if (len < 8) {
memset(&sublist[len], 0, (8 - len) * sizeof(float));
reduce = false;
}
// Loop over all float8's in the list
int j = 0;
for (int i = 0; i < len; i += 8) {
// Grab the next float8 and compute the sum
res = *(float8*) &list[i];
// Shuffle 1 & find sum
b = shuffle8(res, res, shufflemask);
res = min8(res, b);
// Shuffle 2 & find sum
b = shuffle8(res, res, shufflemask2);
res = min8(res, b);
// Permute & find sum
b = permute8(res, res, permutemask);
res = min8(res, b);
// Chuck the result into the next index of the result array
sublist[j] = *res0;
j++;
}
len = j;
list = sublist;
} while (reduce);
float m = sublist[0];
delete[] sub;
return m;
};
// For an array of floats, compute the sum
float sumVal(float *list1, int len) {
float sum = 0;
for (int i = 0; i < len; i++) {
sum += list1[i];
}
return sum;
};
// Find the sum of the array, this time using AVX
// to chunk the workload into groups of 8 which are solved
// in parallel
float sumValAVX(float *list, int len) {
const int shufflemask = 0xB1; // 0b 10 11 00 01
const int shufflemask2 = 0x4E; // 0b 01 00 11 10
const int permutemask = 0x21; // 0b 00 10 00 01
// Creat buffer for results
int sublen = len / 8;
float *sub = new float[sublen + BUFFER];
float *sublist = alignMem(sub);
float8 res, b;
float *res0 = (float*) &res;
bool reduce = true;
do {
if (len < 8) {
memset(&list[len], 0, ((8 - len) + 1) * sizeof(float));
reduce = false;
}
// Loop over all float8's in the list
int j = 0;
for (int i = 0; i < len; i += 8) {
// Grab the next float8 and compute the sum
res = *(float8*) &list[i];
// Shuffle 1 & find sum
b = shuffle8(res, res, shufflemask);
res = add8(res, b);
// Shuffle 2 & find sum
b = shuffle8(res, res, shufflemask2);
res = add8(res, b);
// Permute & find sum
b = permute8(res, res, permutemask);
res = add8(res, b);
// Chuck the result into the next index of the result array
sublist[j] = *res0;
j++;
}
len = j;
list = sublist;
} while (reduce);
float sum = sublist[0];
delete [] sub;
return sum;
};
/* RUN */
int main() {
srand(time(NULL));
// Make some random data to play with...
cout << "Array 1" << endl;
float *list1 = makeArray(LEN);
//printArray(list1, LEN);
cout << "Array 2" << endl;
float *list2 = makeArray(LEN);
//printArray(list2, LEN);
float *res = makeArray(LEN, false);
// Start the clock!
Chrono c;
/* Test the Add function! */
cout << endl << "Add Test" << endl;
addVal(list1, list2, res, LEN);
//cout << " "; printArray(res, LEN);
c.PrintElapsedTime_us("\n Time: ");
addValAVX(list1, list2, res, LEN);
//cout << "AVX "; printArray(res, LEN);
c.PrintElapsedTime_us("\nAVX Time: ");
/* Test the Blend function! */
cout << endl << "Blend Test" << endl;
blendVal(list1, list2, res, LEN);
//cout << " "; printArray(res, LEN);
c.PrintElapsedTime_us("\n Time: ");
blendValAVX(list1, list2, res, LEN);
//cout << "AVX "; printArray(res, LEN);
c.PrintElapsedTime_us("\nAVX Time: ");
/* Test the Shuffle function! */
cout << endl << "Shuffle Test" << endl;
shuffleVal(list1, list2, res, LEN);
//cout << " "; printArray(res, LEN);
c.PrintElapsedTime_us("\n Time: ");
shuffleValAVX(list1, list2, res, LEN);
//cout << "AVX "; printArray(res, LEN);
c.PrintElapsedTime_us("\nAVX Time: ");
/* Test the Min function! */
cout << endl << "Min Test" << endl;
cout << "\n " << minVal(list1, LEN) << endl;
c.PrintElapsedTime_us("\n Time: ");
cout << "\nAVX " << minValAVX(list1, LEN) << endl;
c.PrintElapsedTime_us("\nAVX Time: ");
/* Test the Sum function! */
cout << endl << "Sum Test" << endl;
cout << "\n " << sumVal(list1, LEN) << endl;
c.PrintElapsedTime_us("\n Time: ");
cout << "\nAVX " << sumValAVX(list1, LEN) << endl;
c.PrintElapsedTime_us("\nAVX Time: ");
cout << endl << "Len: " << LEN << endl;
system("PAUSE");
};
```