What's wrong in this SSE2 transposition?
I'm trying to convert this code:
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhase;
double bp0 = mNoteFrequency * mHostPitch;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
// some other code (that will use phase, like sin(phase))
phase += std::clamp(radiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
}
mPhase = phase;
in SSE2, trying to speed up the whole block (which is called often). I'm using MSVC with the Fast optimizazion flag, but auto-vectorization is very crap. Since I'm also learning vectorization, I find it a nice challenge.
So I've take the formula above, and simplified, such as:
radiansPerSampleBp0 = radiansPerSample * bp0;
phase += std::clamp(radiansPerSampleBp0 * pB[sampleIndex] + radiansPerSample * pC[sampleIndex]), 0.0, PI);
Which can be muted into a serial dependency such as:
phase[0] += (radiansPerSampleBp0 * pB[0] + radiansPerSample * pC[0])
phase[1] += (radiansPerSampleBp0 * pB[1] + radiansPerSample * pC[1]) + (radiansPerSampleBp0 * pB[0] + radiansPerSample * pC[0])
phase[2] += (radiansPerSampleBp0 * pB[2] + radiansPerSample * pC[2]) + (radiansPerSampleBp0 * pB[1] + radiansPerSample * pC[1])
phase[3] += (radiansPerSampleBp0 * pB[3] + radiansPerSample * pC[3]) + (radiansPerSampleBp0 * pB[2] + radiansPerSample * pC[2])
phase[4] += (radiansPerSampleBp0 * pB[4] + radiansPerSample * pC[4]) + (radiansPerSampleBp0 * pB[3] + radiansPerSample * pC[3])
phase[5] += (radiansPerSampleBp0 * pB[5] + radiansPerSample * pC[5]) + (radiansPerSampleBp0 * pB[4] + radiansPerSample * pC[4])
Hence, the code I did:
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhase;
double bp0 = mNoteFrequency * mHostPitch;
__m128d v_boundLower = _mm_set1_pd(0.0);
__m128d v_boundUpper = _mm_set1_pd(PI);
__m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);
__m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
__m128d v_pB0 = _mm_load_pd(pB);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
__m128d v_pC0 = _mm_load_pd(pC);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
__m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
__m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
__m128d v_phase = _mm_set1_pd(phase);
__m128d v_phaseAcc;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
// some other code (that will use phase, like sin(phase))
v_phaseAcc = _mm_add_pd(v_pB0, v_pC0);
v_phaseAcc = _mm_max_pd(v_phaseAcc, v_boundLower);
v_phaseAcc = _mm_min_pd(v_phaseAcc, v_boundUpper);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pB1);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pC1);
v_phase = _mm_add_pd(v_phase, v_phaseAcc);
v_pB0 = _mm_load_pd(pB + 2);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
v_pC0 = _mm_load_pd(pC + 2);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
v_pB1 = _mm_load_pd(pB + 1);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
v_pC1 = _mm_load_pd(pC + 1);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
}
mPhase = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
But, unfortunately, after sum "steps", the results become very different for each phase value.
Tried to debug, but I'm not really able to find where the problem is.
Also, it's not really so "fast" rather than the old version.
Are you able to recognize the trouble? And how you will speed-up the code?
Here's the whole code, if you want to check the two different outputs:
#include <iostream>
#include <algorithm>
#include <immintrin.h>
#include <emmintrin.h>
#define PI 3.14159265358979323846
constexpr int voiceSize = 1;
constexpr int bufferSize = 256;
class Param
{
public:
alignas(16) double mPhase = 0.0;
alignas(16) double mPhaseOptimized = 0.0;
alignas(16) double mNoteFrequency = 10.0;
alignas(16) double mHostPitch = 1.0;
alignas(16) double mRadiansPerSample = 1.0;
alignas(16) double b[voiceSize][bufferSize];
alignas(16) double c[voiceSize][bufferSize];
Param() { }
inline void Process(int voiceIndex, int blockSize) {
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhase;
double bp0 = mNoteFrequency * mHostPitch;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
// some other code (that will use phase, like sin(phase))
phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
std::cout << sampleIndex << ": " << phase << std::endl;
}
mPhase = phase;
}
inline void ProcessOptimized(int voiceIndex, int blockSize) {
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhaseOptimized;
double bp0 = mNoteFrequency * mHostPitch;
__m128d v_boundLower = _mm_set1_pd(0.0);
__m128d v_boundUpper = _mm_set1_pd(PI);
__m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);
__m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
__m128d v_pB0 = _mm_load_pd(pB);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
__m128d v_pC0 = _mm_load_pd(pC);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
__m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
__m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
__m128d v_phase = _mm_set1_pd(phase);
__m128d v_phaseAcc;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
// some other code (that will use phase, like sin(phase))
v_phaseAcc = _mm_add_pd(v_pB0, v_pC0);
v_phaseAcc = _mm_max_pd(v_phaseAcc, v_boundLower);
v_phaseAcc = _mm_min_pd(v_phaseAcc, v_boundUpper);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pB1);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pC1);
v_phase = _mm_add_pd(v_phase, v_phaseAcc);
v_pB0 = _mm_load_pd(pB + 2);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
v_pC0 = _mm_load_pd(pC + 2);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
v_pB1 = _mm_load_pd(pB + 1);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
v_pC1 = _mm_load_pd(pC + 1);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
std::cout << sampleIndex << ": " << v_phase.m128d_f64[0] << std::endl;
std::cout << sampleIndex + 1 << ": " << v_phase.m128d_f64[1] << std::endl;
}
mPhaseOptimized = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
}
};
class MyPlugin
{
public:
Param mParam1;
MyPlugin() {
// fill b
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
double value = (sampleIndex / ((double)bufferSize - 1));
mParam1.b[voiceIndex][sampleIndex] = value;
}
}
// fill c
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
double value = 0.0;
mParam1.c[voiceIndex][sampleIndex] = value;
}
}
}
~MyPlugin() { }
void Process(int blockSize) {
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
mParam1.Process(voiceIndex, blockSize);
}
}
void ProcessOptimized(int blockSize) {
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
mParam1.ProcessOptimized(voiceIndex, blockSize);
}
}
};
int main() {
MyPlugin myPlugin;
long long numProcessing = 1;
long long counterProcessing = 0;
// I'll only process once block, just for analysis
while (counterProcessing++ < numProcessing) {
// variable blockSize (i.e. it can vary, being even or odd)
int blockSize = 256;
// process data
myPlugin.Process(blockSize);
std::cout << "#########" << std::endl;
myPlugin.ProcessOptimized(blockSize);
}
}
c++ vectorization intrinsics sse2
add a comment |
I'm trying to convert this code:
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhase;
double bp0 = mNoteFrequency * mHostPitch;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
// some other code (that will use phase, like sin(phase))
phase += std::clamp(radiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
}
mPhase = phase;
in SSE2, trying to speed up the whole block (which is called often). I'm using MSVC with the Fast optimizazion flag, but auto-vectorization is very crap. Since I'm also learning vectorization, I find it a nice challenge.
So I've take the formula above, and simplified, such as:
radiansPerSampleBp0 = radiansPerSample * bp0;
phase += std::clamp(radiansPerSampleBp0 * pB[sampleIndex] + radiansPerSample * pC[sampleIndex]), 0.0, PI);
Which can be muted into a serial dependency such as:
phase[0] += (radiansPerSampleBp0 * pB[0] + radiansPerSample * pC[0])
phase[1] += (radiansPerSampleBp0 * pB[1] + radiansPerSample * pC[1]) + (radiansPerSampleBp0 * pB[0] + radiansPerSample * pC[0])
phase[2] += (radiansPerSampleBp0 * pB[2] + radiansPerSample * pC[2]) + (radiansPerSampleBp0 * pB[1] + radiansPerSample * pC[1])
phase[3] += (radiansPerSampleBp0 * pB[3] + radiansPerSample * pC[3]) + (radiansPerSampleBp0 * pB[2] + radiansPerSample * pC[2])
phase[4] += (radiansPerSampleBp0 * pB[4] + radiansPerSample * pC[4]) + (radiansPerSampleBp0 * pB[3] + radiansPerSample * pC[3])
phase[5] += (radiansPerSampleBp0 * pB[5] + radiansPerSample * pC[5]) + (radiansPerSampleBp0 * pB[4] + radiansPerSample * pC[4])
Hence, the code I did:
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhase;
double bp0 = mNoteFrequency * mHostPitch;
__m128d v_boundLower = _mm_set1_pd(0.0);
__m128d v_boundUpper = _mm_set1_pd(PI);
__m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);
__m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
__m128d v_pB0 = _mm_load_pd(pB);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
__m128d v_pC0 = _mm_load_pd(pC);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
__m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
__m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
__m128d v_phase = _mm_set1_pd(phase);
__m128d v_phaseAcc;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
// some other code (that will use phase, like sin(phase))
v_phaseAcc = _mm_add_pd(v_pB0, v_pC0);
v_phaseAcc = _mm_max_pd(v_phaseAcc, v_boundLower);
v_phaseAcc = _mm_min_pd(v_phaseAcc, v_boundUpper);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pB1);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pC1);
v_phase = _mm_add_pd(v_phase, v_phaseAcc);
v_pB0 = _mm_load_pd(pB + 2);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
v_pC0 = _mm_load_pd(pC + 2);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
v_pB1 = _mm_load_pd(pB + 1);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
v_pC1 = _mm_load_pd(pC + 1);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
}
mPhase = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
But, unfortunately, after sum "steps", the results become very different for each phase value.
Tried to debug, but I'm not really able to find where the problem is.
Also, it's not really so "fast" rather than the old version.
Are you able to recognize the trouble? And how you will speed-up the code?
Here's the whole code, if you want to check the two different outputs:
#include <iostream>
#include <algorithm>
#include <immintrin.h>
#include <emmintrin.h>
#define PI 3.14159265358979323846
constexpr int voiceSize = 1;
constexpr int bufferSize = 256;
class Param
{
public:
alignas(16) double mPhase = 0.0;
alignas(16) double mPhaseOptimized = 0.0;
alignas(16) double mNoteFrequency = 10.0;
alignas(16) double mHostPitch = 1.0;
alignas(16) double mRadiansPerSample = 1.0;
alignas(16) double b[voiceSize][bufferSize];
alignas(16) double c[voiceSize][bufferSize];
Param() { }
inline void Process(int voiceIndex, int blockSize) {
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhase;
double bp0 = mNoteFrequency * mHostPitch;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
// some other code (that will use phase, like sin(phase))
phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
std::cout << sampleIndex << ": " << phase << std::endl;
}
mPhase = phase;
}
inline void ProcessOptimized(int voiceIndex, int blockSize) {
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhaseOptimized;
double bp0 = mNoteFrequency * mHostPitch;
__m128d v_boundLower = _mm_set1_pd(0.0);
__m128d v_boundUpper = _mm_set1_pd(PI);
__m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);
__m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
__m128d v_pB0 = _mm_load_pd(pB);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
__m128d v_pC0 = _mm_load_pd(pC);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
__m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
__m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
__m128d v_phase = _mm_set1_pd(phase);
__m128d v_phaseAcc;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
// some other code (that will use phase, like sin(phase))
v_phaseAcc = _mm_add_pd(v_pB0, v_pC0);
v_phaseAcc = _mm_max_pd(v_phaseAcc, v_boundLower);
v_phaseAcc = _mm_min_pd(v_phaseAcc, v_boundUpper);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pB1);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pC1);
v_phase = _mm_add_pd(v_phase, v_phaseAcc);
v_pB0 = _mm_load_pd(pB + 2);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
v_pC0 = _mm_load_pd(pC + 2);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
v_pB1 = _mm_load_pd(pB + 1);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
v_pC1 = _mm_load_pd(pC + 1);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
std::cout << sampleIndex << ": " << v_phase.m128d_f64[0] << std::endl;
std::cout << sampleIndex + 1 << ": " << v_phase.m128d_f64[1] << std::endl;
}
mPhaseOptimized = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
}
};
class MyPlugin
{
public:
Param mParam1;
MyPlugin() {
// fill b
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
double value = (sampleIndex / ((double)bufferSize - 1));
mParam1.b[voiceIndex][sampleIndex] = value;
}
}
// fill c
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
double value = 0.0;
mParam1.c[voiceIndex][sampleIndex] = value;
}
}
}
~MyPlugin() { }
void Process(int blockSize) {
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
mParam1.Process(voiceIndex, blockSize);
}
}
void ProcessOptimized(int blockSize) {
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
mParam1.ProcessOptimized(voiceIndex, blockSize);
}
}
};
int main() {
MyPlugin myPlugin;
long long numProcessing = 1;
long long counterProcessing = 0;
// I'll only process once block, just for analysis
while (counterProcessing++ < numProcessing) {
// variable blockSize (i.e. it can vary, being even or odd)
int blockSize = 256;
// process data
myPlugin.Process(blockSize);
std::cout << "#########" << std::endl;
myPlugin.ProcessOptimized(blockSize);
}
}
c++ vectorization intrinsics sse2
Comments are not for extended discussion; this conversation has been moved to chat.
– Samuel Liew♦
Jan 3 at 23:12
add a comment |
I'm trying to convert this code:
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhase;
double bp0 = mNoteFrequency * mHostPitch;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
// some other code (that will use phase, like sin(phase))
phase += std::clamp(radiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
}
mPhase = phase;
in SSE2, trying to speed up the whole block (which is called often). I'm using MSVC with the Fast optimizazion flag, but auto-vectorization is very crap. Since I'm also learning vectorization, I find it a nice challenge.
So I've take the formula above, and simplified, such as:
radiansPerSampleBp0 = radiansPerSample * bp0;
phase += std::clamp(radiansPerSampleBp0 * pB[sampleIndex] + radiansPerSample * pC[sampleIndex]), 0.0, PI);
Which can be muted into a serial dependency such as:
phase[0] += (radiansPerSampleBp0 * pB[0] + radiansPerSample * pC[0])
phase[1] += (radiansPerSampleBp0 * pB[1] + radiansPerSample * pC[1]) + (radiansPerSampleBp0 * pB[0] + radiansPerSample * pC[0])
phase[2] += (radiansPerSampleBp0 * pB[2] + radiansPerSample * pC[2]) + (radiansPerSampleBp0 * pB[1] + radiansPerSample * pC[1])
phase[3] += (radiansPerSampleBp0 * pB[3] + radiansPerSample * pC[3]) + (radiansPerSampleBp0 * pB[2] + radiansPerSample * pC[2])
phase[4] += (radiansPerSampleBp0 * pB[4] + radiansPerSample * pC[4]) + (radiansPerSampleBp0 * pB[3] + radiansPerSample * pC[3])
phase[5] += (radiansPerSampleBp0 * pB[5] + radiansPerSample * pC[5]) + (radiansPerSampleBp0 * pB[4] + radiansPerSample * pC[4])
Hence, the code I did:
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhase;
double bp0 = mNoteFrequency * mHostPitch;
__m128d v_boundLower = _mm_set1_pd(0.0);
__m128d v_boundUpper = _mm_set1_pd(PI);
__m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);
__m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
__m128d v_pB0 = _mm_load_pd(pB);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
__m128d v_pC0 = _mm_load_pd(pC);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
__m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
__m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
__m128d v_phase = _mm_set1_pd(phase);
__m128d v_phaseAcc;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
// some other code (that will use phase, like sin(phase))
v_phaseAcc = _mm_add_pd(v_pB0, v_pC0);
v_phaseAcc = _mm_max_pd(v_phaseAcc, v_boundLower);
v_phaseAcc = _mm_min_pd(v_phaseAcc, v_boundUpper);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pB1);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pC1);
v_phase = _mm_add_pd(v_phase, v_phaseAcc);
v_pB0 = _mm_load_pd(pB + 2);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
v_pC0 = _mm_load_pd(pC + 2);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
v_pB1 = _mm_load_pd(pB + 1);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
v_pC1 = _mm_load_pd(pC + 1);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
}
mPhase = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
But, unfortunately, after sum "steps", the results become very different for each phase value.
Tried to debug, but I'm not really able to find where the problem is.
Also, it's not really so "fast" rather than the old version.
Are you able to recognize the trouble? And how you will speed-up the code?
Here's the whole code, if you want to check the two different outputs:
#include <iostream>
#include <algorithm>
#include <immintrin.h>
#include <emmintrin.h>
#define PI 3.14159265358979323846
constexpr int voiceSize = 1;
constexpr int bufferSize = 256;
class Param
{
public:
alignas(16) double mPhase = 0.0;
alignas(16) double mPhaseOptimized = 0.0;
alignas(16) double mNoteFrequency = 10.0;
alignas(16) double mHostPitch = 1.0;
alignas(16) double mRadiansPerSample = 1.0;
alignas(16) double b[voiceSize][bufferSize];
alignas(16) double c[voiceSize][bufferSize];
Param() { }
inline void Process(int voiceIndex, int blockSize) {
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhase;
double bp0 = mNoteFrequency * mHostPitch;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
// some other code (that will use phase, like sin(phase))
phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
std::cout << sampleIndex << ": " << phase << std::endl;
}
mPhase = phase;
}
inline void ProcessOptimized(int voiceIndex, int blockSize) {
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhaseOptimized;
double bp0 = mNoteFrequency * mHostPitch;
__m128d v_boundLower = _mm_set1_pd(0.0);
__m128d v_boundUpper = _mm_set1_pd(PI);
__m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);
__m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
__m128d v_pB0 = _mm_load_pd(pB);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
__m128d v_pC0 = _mm_load_pd(pC);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
__m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
__m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
__m128d v_phase = _mm_set1_pd(phase);
__m128d v_phaseAcc;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
// some other code (that will use phase, like sin(phase))
v_phaseAcc = _mm_add_pd(v_pB0, v_pC0);
v_phaseAcc = _mm_max_pd(v_phaseAcc, v_boundLower);
v_phaseAcc = _mm_min_pd(v_phaseAcc, v_boundUpper);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pB1);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pC1);
v_phase = _mm_add_pd(v_phase, v_phaseAcc);
v_pB0 = _mm_load_pd(pB + 2);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
v_pC0 = _mm_load_pd(pC + 2);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
v_pB1 = _mm_load_pd(pB + 1);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
v_pC1 = _mm_load_pd(pC + 1);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
std::cout << sampleIndex << ": " << v_phase.m128d_f64[0] << std::endl;
std::cout << sampleIndex + 1 << ": " << v_phase.m128d_f64[1] << std::endl;
}
mPhaseOptimized = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
}
};
class MyPlugin
{
public:
Param mParam1;
MyPlugin() {
// fill b
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
double value = (sampleIndex / ((double)bufferSize - 1));
mParam1.b[voiceIndex][sampleIndex] = value;
}
}
// fill c
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
double value = 0.0;
mParam1.c[voiceIndex][sampleIndex] = value;
}
}
}
~MyPlugin() { }
void Process(int blockSize) {
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
mParam1.Process(voiceIndex, blockSize);
}
}
void ProcessOptimized(int blockSize) {
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
mParam1.ProcessOptimized(voiceIndex, blockSize);
}
}
};
int main() {
MyPlugin myPlugin;
long long numProcessing = 1;
long long counterProcessing = 0;
// I'll only process once block, just for analysis
while (counterProcessing++ < numProcessing) {
// variable blockSize (i.e. it can vary, being even or odd)
int blockSize = 256;
// process data
myPlugin.Process(blockSize);
std::cout << "#########" << std::endl;
myPlugin.ProcessOptimized(blockSize);
}
}
c++ vectorization intrinsics sse2
I'm trying to convert this code:
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhase;
double bp0 = mNoteFrequency * mHostPitch;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
// some other code (that will use phase, like sin(phase))
phase += std::clamp(radiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
}
mPhase = phase;
in SSE2, trying to speed up the whole block (which is called often). I'm using MSVC with the Fast optimizazion flag, but auto-vectorization is very crap. Since I'm also learning vectorization, I find it a nice challenge.
So I've take the formula above, and simplified, such as:
radiansPerSampleBp0 = radiansPerSample * bp0;
phase += std::clamp(radiansPerSampleBp0 * pB[sampleIndex] + radiansPerSample * pC[sampleIndex]), 0.0, PI);
Which can be muted into a serial dependency such as:
phase[0] += (radiansPerSampleBp0 * pB[0] + radiansPerSample * pC[0])
phase[1] += (radiansPerSampleBp0 * pB[1] + radiansPerSample * pC[1]) + (radiansPerSampleBp0 * pB[0] + radiansPerSample * pC[0])
phase[2] += (radiansPerSampleBp0 * pB[2] + radiansPerSample * pC[2]) + (radiansPerSampleBp0 * pB[1] + radiansPerSample * pC[1])
phase[3] += (radiansPerSampleBp0 * pB[3] + radiansPerSample * pC[3]) + (radiansPerSampleBp0 * pB[2] + radiansPerSample * pC[2])
phase[4] += (radiansPerSampleBp0 * pB[4] + radiansPerSample * pC[4]) + (radiansPerSampleBp0 * pB[3] + radiansPerSample * pC[3])
phase[5] += (radiansPerSampleBp0 * pB[5] + radiansPerSample * pC[5]) + (radiansPerSampleBp0 * pB[4] + radiansPerSample * pC[4])
Hence, the code I did:
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhase;
double bp0 = mNoteFrequency * mHostPitch;
__m128d v_boundLower = _mm_set1_pd(0.0);
__m128d v_boundUpper = _mm_set1_pd(PI);
__m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);
__m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
__m128d v_pB0 = _mm_load_pd(pB);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
__m128d v_pC0 = _mm_load_pd(pC);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
__m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
__m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
__m128d v_phase = _mm_set1_pd(phase);
__m128d v_phaseAcc;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
// some other code (that will use phase, like sin(phase))
v_phaseAcc = _mm_add_pd(v_pB0, v_pC0);
v_phaseAcc = _mm_max_pd(v_phaseAcc, v_boundLower);
v_phaseAcc = _mm_min_pd(v_phaseAcc, v_boundUpper);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pB1);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pC1);
v_phase = _mm_add_pd(v_phase, v_phaseAcc);
v_pB0 = _mm_load_pd(pB + 2);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
v_pC0 = _mm_load_pd(pC + 2);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
v_pB1 = _mm_load_pd(pB + 1);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
v_pC1 = _mm_load_pd(pC + 1);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
}
mPhase = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
But, unfortunately, after sum "steps", the results become very different for each phase value.
Tried to debug, but I'm not really able to find where the problem is.
Also, it's not really so "fast" rather than the old version.
Are you able to recognize the trouble? And how you will speed-up the code?
Here's the whole code, if you want to check the two different outputs:
#include <iostream>
#include <algorithm>
#include <immintrin.h>
#include <emmintrin.h>
#define PI 3.14159265358979323846
constexpr int voiceSize = 1;
constexpr int bufferSize = 256;
class Param
{
public:
alignas(16) double mPhase = 0.0;
alignas(16) double mPhaseOptimized = 0.0;
alignas(16) double mNoteFrequency = 10.0;
alignas(16) double mHostPitch = 1.0;
alignas(16) double mRadiansPerSample = 1.0;
alignas(16) double b[voiceSize][bufferSize];
alignas(16) double c[voiceSize][bufferSize];
Param() { }
inline void Process(int voiceIndex, int blockSize) {
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhase;
double bp0 = mNoteFrequency * mHostPitch;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
// some other code (that will use phase, like sin(phase))
phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
std::cout << sampleIndex << ": " << phase << std::endl;
}
mPhase = phase;
}
inline void ProcessOptimized(int voiceIndex, int blockSize) {
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double phase = mPhaseOptimized;
double bp0 = mNoteFrequency * mHostPitch;
__m128d v_boundLower = _mm_set1_pd(0.0);
__m128d v_boundUpper = _mm_set1_pd(PI);
__m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);
__m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
__m128d v_pB0 = _mm_load_pd(pB);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
__m128d v_pC0 = _mm_load_pd(pC);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
__m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
__m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
__m128d v_phase = _mm_set1_pd(phase);
__m128d v_phaseAcc;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
// some other code (that will use phase, like sin(phase))
v_phaseAcc = _mm_add_pd(v_pB0, v_pC0);
v_phaseAcc = _mm_max_pd(v_phaseAcc, v_boundLower);
v_phaseAcc = _mm_min_pd(v_phaseAcc, v_boundUpper);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pB1);
v_phaseAcc = _mm_add_pd(v_phaseAcc, v_pC1);
v_phase = _mm_add_pd(v_phase, v_phaseAcc);
v_pB0 = _mm_load_pd(pB + 2);
v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
v_pC0 = _mm_load_pd(pC + 2);
v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
v_pB1 = _mm_load_pd(pB + 1);
v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
v_pC1 = _mm_load_pd(pC + 1);
v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
std::cout << sampleIndex << ": " << v_phase.m128d_f64[0] << std::endl;
std::cout << sampleIndex + 1 << ": " << v_phase.m128d_f64[1] << std::endl;
}
mPhaseOptimized = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
}
};
class MyPlugin
{
public:
Param mParam1;
MyPlugin() {
// fill b
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
double value = (sampleIndex / ((double)bufferSize - 1));
mParam1.b[voiceIndex][sampleIndex] = value;
}
}
// fill c
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
double value = 0.0;
mParam1.c[voiceIndex][sampleIndex] = value;
}
}
}
~MyPlugin() { }
void Process(int blockSize) {
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
mParam1.Process(voiceIndex, blockSize);
}
}
void ProcessOptimized(int blockSize) {
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
mParam1.ProcessOptimized(voiceIndex, blockSize);
}
}
};
int main() {
MyPlugin myPlugin;
long long numProcessing = 1;
long long counterProcessing = 0;
// I'll only process once block, just for analysis
while (counterProcessing++ < numProcessing) {
// variable blockSize (i.e. it can vary, being even or odd)
int blockSize = 256;
// process data
myPlugin.Process(blockSize);
std::cout << "#########" << std::endl;
myPlugin.ProcessOptimized(blockSize);
}
}
c++ vectorization intrinsics sse2
c++ vectorization intrinsics sse2
edited Jan 3 at 13:26
markzzz
asked Jan 3 at 11:32
markzzzmarkzzz
19.2k94234408
19.2k94234408
Comments are not for extended discussion; this conversation has been moved to chat.
– Samuel Liew♦
Jan 3 at 23:12
add a comment |
Comments are not for extended discussion; this conversation has been moved to chat.
– Samuel Liew♦
Jan 3 at 23:12
Comments are not for extended discussion; this conversation has been moved to chat.
– Samuel Liew♦
Jan 3 at 23:12
Comments are not for extended discussion; this conversation has been moved to chat.
– Samuel Liew♦
Jan 3 at 23:12
add a comment |
1 Answer
1
active
oldest
votes
(update: this answer was written before the edits that show v_phase
being used inside the loop.)
Wait a minute, I thought in your previous question you needed the value of phase
at each step. Yeah, there was a // some other code (that will use phase)
comment inside the loop.
But this looks like you're only interested in the final value. So you're free to reorder things because the clamping for each step is independent.
This is just a reduction (like sum of an array) with some processing on the fly to generate the inputs to the reduction.
You want the 2 elements of v_phase
to be 2 independent partial sums for the even / odd elements. Then you horizontal sum it at the end. (e.g. _mm_unpackhi_pd(v_phase, v_phase)
to bring the high half to the bottom, or see Fastest way to do horizontal float vector sum on x86).
Then optionally use scalar fmod
on the result to range-reduce into the [0..2Pi)
range. (Occasional range-reduction during the sum could help precision by stopping the value from getting so large, if it turns out that precision becomes a problem.)
If that isn't the case, and you do need a vector of { phase[i+0], phase[i+1] }
for something at every i+=2
step, then your problem seems to be related to a prefix sum. But with only 2 elements per vector, just redundantly doing everything to elements with unaligned loads probably makes sense.
There might be less savings than I thought since you need to clamp each step separately: doing pB[i+0] + pB[i+1]
before multiplying could result in different clamping.
But you've apparently removed the clamping in our simplified formula, so you can potentially add elements before applying the mul/add formula.
Or maybe it's a win to do the multiply/add stuff for two steps at once, then shuffle that around to get the right stuff added.
No, I'm not interested in the final value, but at the value at each iteration, since I would use it within a sin function (I didn't place that comment here, but I would put other code into that loop, once I've done the phase calculation; now I've add the comment). Note that I also need to "store" the phase at each iteration because the next iteration need to start from the one I've calculated before.
– markzzz
Jan 3 at 13:00
"If that isn't the case, and you do need a vector of { phase[i+0], phase[i+1] } for something at every i+=2 step, then your problem seems to be related to a prefix sum" I think that's the problem. But not sure what you mean with "prefix" sum.
– markzzz
Jan 3 at 13:01
@markzzz: your manually vectorized function only doesmPhase = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
at the end of the loop, so I assumed I'd missed something and that's all you needed.
– Peter Cordes
Jan 3 at 13:04
@markzzz: en.wikipedia.org/wiki/Prefix_sum is a (somewhat) well-known problem in computing. See also parallel prefix (cumulative) sum with SSE and SIMD prefix sum on Intel cpu. With vectors of 4xfloat
, there's speedup to be had from shuffling for horizontal sums. But with only 2 elements per vector because you're usingdouble
(for audio??) there's probably no speedup. OTOH, doing the add/mul/clamp isn't free, so shuffling that output could be a win.
– Peter Cordes
Jan 3 at 13:08
I've added the whole example, if you would like to check it out ;) That would make things clear, I hope!
– markzzz
Jan 3 at 13:11
|
show 2 more comments
Your Answer
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");
StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);
StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});
function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});
}
});
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f54021454%2fwhats-wrong-in-this-sse2-transposition%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
1 Answer
1
active
oldest
votes
1 Answer
1
active
oldest
votes
active
oldest
votes
active
oldest
votes
(update: this answer was written before the edits that show v_phase
being used inside the loop.)
Wait a minute, I thought in your previous question you needed the value of phase
at each step. Yeah, there was a // some other code (that will use phase)
comment inside the loop.
But this looks like you're only interested in the final value. So you're free to reorder things because the clamping for each step is independent.
This is just a reduction (like sum of an array) with some processing on the fly to generate the inputs to the reduction.
You want the 2 elements of v_phase
to be 2 independent partial sums for the even / odd elements. Then you horizontal sum it at the end. (e.g. _mm_unpackhi_pd(v_phase, v_phase)
to bring the high half to the bottom, or see Fastest way to do horizontal float vector sum on x86).
Then optionally use scalar fmod
on the result to range-reduce into the [0..2Pi)
range. (Occasional range-reduction during the sum could help precision by stopping the value from getting so large, if it turns out that precision becomes a problem.)
If that isn't the case, and you do need a vector of { phase[i+0], phase[i+1] }
for something at every i+=2
step, then your problem seems to be related to a prefix sum. But with only 2 elements per vector, just redundantly doing everything to elements with unaligned loads probably makes sense.
There might be less savings than I thought since you need to clamp each step separately: doing pB[i+0] + pB[i+1]
before multiplying could result in different clamping.
But you've apparently removed the clamping in our simplified formula, so you can potentially add elements before applying the mul/add formula.
Or maybe it's a win to do the multiply/add stuff for two steps at once, then shuffle that around to get the right stuff added.
No, I'm not interested in the final value, but at the value at each iteration, since I would use it within a sin function (I didn't place that comment here, but I would put other code into that loop, once I've done the phase calculation; now I've add the comment). Note that I also need to "store" the phase at each iteration because the next iteration need to start from the one I've calculated before.
– markzzz
Jan 3 at 13:00
"If that isn't the case, and you do need a vector of { phase[i+0], phase[i+1] } for something at every i+=2 step, then your problem seems to be related to a prefix sum" I think that's the problem. But not sure what you mean with "prefix" sum.
– markzzz
Jan 3 at 13:01
@markzzz: your manually vectorized function only doesmPhase = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
at the end of the loop, so I assumed I'd missed something and that's all you needed.
– Peter Cordes
Jan 3 at 13:04
@markzzz: en.wikipedia.org/wiki/Prefix_sum is a (somewhat) well-known problem in computing. See also parallel prefix (cumulative) sum with SSE and SIMD prefix sum on Intel cpu. With vectors of 4xfloat
, there's speedup to be had from shuffling for horizontal sums. But with only 2 elements per vector because you're usingdouble
(for audio??) there's probably no speedup. OTOH, doing the add/mul/clamp isn't free, so shuffling that output could be a win.
– Peter Cordes
Jan 3 at 13:08
I've added the whole example, if you would like to check it out ;) That would make things clear, I hope!
– markzzz
Jan 3 at 13:11
|
show 2 more comments
(update: this answer was written before the edits that show v_phase
being used inside the loop.)
Wait a minute, I thought in your previous question you needed the value of phase
at each step. Yeah, there was a // some other code (that will use phase)
comment inside the loop.
But this looks like you're only interested in the final value. So you're free to reorder things because the clamping for each step is independent.
This is just a reduction (like sum of an array) with some processing on the fly to generate the inputs to the reduction.
You want the 2 elements of v_phase
to be 2 independent partial sums for the even / odd elements. Then you horizontal sum it at the end. (e.g. _mm_unpackhi_pd(v_phase, v_phase)
to bring the high half to the bottom, or see Fastest way to do horizontal float vector sum on x86).
Then optionally use scalar fmod
on the result to range-reduce into the [0..2Pi)
range. (Occasional range-reduction during the sum could help precision by stopping the value from getting so large, if it turns out that precision becomes a problem.)
If that isn't the case, and you do need a vector of { phase[i+0], phase[i+1] }
for something at every i+=2
step, then your problem seems to be related to a prefix sum. But with only 2 elements per vector, just redundantly doing everything to elements with unaligned loads probably makes sense.
There might be less savings than I thought since you need to clamp each step separately: doing pB[i+0] + pB[i+1]
before multiplying could result in different clamping.
But you've apparently removed the clamping in our simplified formula, so you can potentially add elements before applying the mul/add formula.
Or maybe it's a win to do the multiply/add stuff for two steps at once, then shuffle that around to get the right stuff added.
No, I'm not interested in the final value, but at the value at each iteration, since I would use it within a sin function (I didn't place that comment here, but I would put other code into that loop, once I've done the phase calculation; now I've add the comment). Note that I also need to "store" the phase at each iteration because the next iteration need to start from the one I've calculated before.
– markzzz
Jan 3 at 13:00
"If that isn't the case, and you do need a vector of { phase[i+0], phase[i+1] } for something at every i+=2 step, then your problem seems to be related to a prefix sum" I think that's the problem. But not sure what you mean with "prefix" sum.
– markzzz
Jan 3 at 13:01
@markzzz: your manually vectorized function only doesmPhase = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
at the end of the loop, so I assumed I'd missed something and that's all you needed.
– Peter Cordes
Jan 3 at 13:04
@markzzz: en.wikipedia.org/wiki/Prefix_sum is a (somewhat) well-known problem in computing. See also parallel prefix (cumulative) sum with SSE and SIMD prefix sum on Intel cpu. With vectors of 4xfloat
, there's speedup to be had from shuffling for horizontal sums. But with only 2 elements per vector because you're usingdouble
(for audio??) there's probably no speedup. OTOH, doing the add/mul/clamp isn't free, so shuffling that output could be a win.
– Peter Cordes
Jan 3 at 13:08
I've added the whole example, if you would like to check it out ;) That would make things clear, I hope!
– markzzz
Jan 3 at 13:11
|
show 2 more comments
(update: this answer was written before the edits that show v_phase
being used inside the loop.)
Wait a minute, I thought in your previous question you needed the value of phase
at each step. Yeah, there was a // some other code (that will use phase)
comment inside the loop.
But this looks like you're only interested in the final value. So you're free to reorder things because the clamping for each step is independent.
This is just a reduction (like sum of an array) with some processing on the fly to generate the inputs to the reduction.
You want the 2 elements of v_phase
to be 2 independent partial sums for the even / odd elements. Then you horizontal sum it at the end. (e.g. _mm_unpackhi_pd(v_phase, v_phase)
to bring the high half to the bottom, or see Fastest way to do horizontal float vector sum on x86).
Then optionally use scalar fmod
on the result to range-reduce into the [0..2Pi)
range. (Occasional range-reduction during the sum could help precision by stopping the value from getting so large, if it turns out that precision becomes a problem.)
If that isn't the case, and you do need a vector of { phase[i+0], phase[i+1] }
for something at every i+=2
step, then your problem seems to be related to a prefix sum. But with only 2 elements per vector, just redundantly doing everything to elements with unaligned loads probably makes sense.
There might be less savings than I thought since you need to clamp each step separately: doing pB[i+0] + pB[i+1]
before multiplying could result in different clamping.
But you've apparently removed the clamping in our simplified formula, so you can potentially add elements before applying the mul/add formula.
Or maybe it's a win to do the multiply/add stuff for two steps at once, then shuffle that around to get the right stuff added.
(update: this answer was written before the edits that show v_phase
being used inside the loop.)
Wait a minute, I thought in your previous question you needed the value of phase
at each step. Yeah, there was a // some other code (that will use phase)
comment inside the loop.
But this looks like you're only interested in the final value. So you're free to reorder things because the clamping for each step is independent.
This is just a reduction (like sum of an array) with some processing on the fly to generate the inputs to the reduction.
You want the 2 elements of v_phase
to be 2 independent partial sums for the even / odd elements. Then you horizontal sum it at the end. (e.g. _mm_unpackhi_pd(v_phase, v_phase)
to bring the high half to the bottom, or see Fastest way to do horizontal float vector sum on x86).
Then optionally use scalar fmod
on the result to range-reduce into the [0..2Pi)
range. (Occasional range-reduction during the sum could help precision by stopping the value from getting so large, if it turns out that precision becomes a problem.)
If that isn't the case, and you do need a vector of { phase[i+0], phase[i+1] }
for something at every i+=2
step, then your problem seems to be related to a prefix sum. But with only 2 elements per vector, just redundantly doing everything to elements with unaligned loads probably makes sense.
There might be less savings than I thought since you need to clamp each step separately: doing pB[i+0] + pB[i+1]
before multiplying could result in different clamping.
But you've apparently removed the clamping in our simplified formula, so you can potentially add elements before applying the mul/add formula.
Or maybe it's a win to do the multiply/add stuff for two steps at once, then shuffle that around to get the right stuff added.
edited Jan 3 at 13:32
answered Jan 3 at 12:10
Peter CordesPeter Cordes
133k18202339
133k18202339
No, I'm not interested in the final value, but at the value at each iteration, since I would use it within a sin function (I didn't place that comment here, but I would put other code into that loop, once I've done the phase calculation; now I've add the comment). Note that I also need to "store" the phase at each iteration because the next iteration need to start from the one I've calculated before.
– markzzz
Jan 3 at 13:00
"If that isn't the case, and you do need a vector of { phase[i+0], phase[i+1] } for something at every i+=2 step, then your problem seems to be related to a prefix sum" I think that's the problem. But not sure what you mean with "prefix" sum.
– markzzz
Jan 3 at 13:01
@markzzz: your manually vectorized function only doesmPhase = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
at the end of the loop, so I assumed I'd missed something and that's all you needed.
– Peter Cordes
Jan 3 at 13:04
@markzzz: en.wikipedia.org/wiki/Prefix_sum is a (somewhat) well-known problem in computing. See also parallel prefix (cumulative) sum with SSE and SIMD prefix sum on Intel cpu. With vectors of 4xfloat
, there's speedup to be had from shuffling for horizontal sums. But with only 2 elements per vector because you're usingdouble
(for audio??) there's probably no speedup. OTOH, doing the add/mul/clamp isn't free, so shuffling that output could be a win.
– Peter Cordes
Jan 3 at 13:08
I've added the whole example, if you would like to check it out ;) That would make things clear, I hope!
– markzzz
Jan 3 at 13:11
|
show 2 more comments
No, I'm not interested in the final value, but at the value at each iteration, since I would use it within a sin function (I didn't place that comment here, but I would put other code into that loop, once I've done the phase calculation; now I've add the comment). Note that I also need to "store" the phase at each iteration because the next iteration need to start from the one I've calculated before.
– markzzz
Jan 3 at 13:00
"If that isn't the case, and you do need a vector of { phase[i+0], phase[i+1] } for something at every i+=2 step, then your problem seems to be related to a prefix sum" I think that's the problem. But not sure what you mean with "prefix" sum.
– markzzz
Jan 3 at 13:01
@markzzz: your manually vectorized function only doesmPhase = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
at the end of the loop, so I assumed I'd missed something and that's all you needed.
– Peter Cordes
Jan 3 at 13:04
@markzzz: en.wikipedia.org/wiki/Prefix_sum is a (somewhat) well-known problem in computing. See also parallel prefix (cumulative) sum with SSE and SIMD prefix sum on Intel cpu. With vectors of 4xfloat
, there's speedup to be had from shuffling for horizontal sums. But with only 2 elements per vector because you're usingdouble
(for audio??) there's probably no speedup. OTOH, doing the add/mul/clamp isn't free, so shuffling that output could be a win.
– Peter Cordes
Jan 3 at 13:08
I've added the whole example, if you would like to check it out ;) That would make things clear, I hope!
– markzzz
Jan 3 at 13:11
No, I'm not interested in the final value, but at the value at each iteration, since I would use it within a sin function (I didn't place that comment here, but I would put other code into that loop, once I've done the phase calculation; now I've add the comment). Note that I also need to "store" the phase at each iteration because the next iteration need to start from the one I've calculated before.
– markzzz
Jan 3 at 13:00
No, I'm not interested in the final value, but at the value at each iteration, since I would use it within a sin function (I didn't place that comment here, but I would put other code into that loop, once I've done the phase calculation; now I've add the comment). Note that I also need to "store" the phase at each iteration because the next iteration need to start from the one I've calculated before.
– markzzz
Jan 3 at 13:00
"If that isn't the case, and you do need a vector of { phase[i+0], phase[i+1] } for something at every i+=2 step, then your problem seems to be related to a prefix sum" I think that's the problem. But not sure what you mean with "prefix" sum.
– markzzz
Jan 3 at 13:01
"If that isn't the case, and you do need a vector of { phase[i+0], phase[i+1] } for something at every i+=2 step, then your problem seems to be related to a prefix sum" I think that's the problem. But not sure what you mean with "prefix" sum.
– markzzz
Jan 3 at 13:01
@markzzz: your manually vectorized function only does
mPhase = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
at the end of the loop, so I assumed I'd missed something and that's all you needed.– Peter Cordes
Jan 3 at 13:04
@markzzz: your manually vectorized function only does
mPhase = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
at the end of the loop, so I assumed I'd missed something and that's all you needed.– Peter Cordes
Jan 3 at 13:04
@markzzz: en.wikipedia.org/wiki/Prefix_sum is a (somewhat) well-known problem in computing. See also parallel prefix (cumulative) sum with SSE and SIMD prefix sum on Intel cpu. With vectors of 4x
float
, there's speedup to be had from shuffling for horizontal sums. But with only 2 elements per vector because you're using double
(for audio??) there's probably no speedup. OTOH, doing the add/mul/clamp isn't free, so shuffling that output could be a win.– Peter Cordes
Jan 3 at 13:08
@markzzz: en.wikipedia.org/wiki/Prefix_sum is a (somewhat) well-known problem in computing. See also parallel prefix (cumulative) sum with SSE and SIMD prefix sum on Intel cpu. With vectors of 4x
float
, there's speedup to be had from shuffling for horizontal sums. But with only 2 elements per vector because you're using double
(for audio??) there's probably no speedup. OTOH, doing the add/mul/clamp isn't free, so shuffling that output could be a win.– Peter Cordes
Jan 3 at 13:08
I've added the whole example, if you would like to check it out ;) That would make things clear, I hope!
– markzzz
Jan 3 at 13:11
I've added the whole example, if you would like to check it out ;) That would make things clear, I hope!
– markzzz
Jan 3 at 13:11
|
show 2 more comments
Thanks for contributing an answer to Stack Overflow!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f54021454%2fwhats-wrong-in-this-sse2-transposition%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Comments are not for extended discussion; this conversation has been moved to chat.
– Samuel Liew♦
Jan 3 at 23:12