Program Listing for File AVX_math.h

Return to documentation for file (SPlisHSPlasH/Utilities/AVX_math.h)

#ifndef AVX_MATH_H
#define AVX_MATH_H

#include <ctime>
#include <iomanip>
#include <vector>
#include <memory>

#include "SPlisHSPlasH/Common.h"

#ifndef __APPLE__
#include <immintrin.h>

// ----------------------------------------------------------------------------------------------
//vector of 8 float values to represent 8 scalars
class Scalarf8
{
public:
    __m256 v;

    Scalarf8() {}

    Scalarf8(float f) { v = _mm256_set1_ps(f); }

//  Scalarf8(float f0, float f1, float f2, float f3, float f4, float f5, float f6, float f7) {
//      v = _mm256_setr_ps(f0, f1, f2, f3, f4, f5, f6, f7);
//  }

    Scalarf8(Real f0, Real f1, Real f2, Real f3, Real f4, Real f5, Real f6, Real f7)
    {
        v = _mm256_setr_ps((float)f0, (float)f1, (float)f2, (float)f3,
                           (float)f4, (float)f5, (float)f6, (float)f7);
    }

    Scalarf8(float const * p)
    {
        v = _mm256_loadu_ps(p);
    }

    Scalarf8(__m256 const & x) {
        v = x;
    }

    inline void setZero() { v = _mm256_setzero_ps(); }

    Scalarf8 & operator = (__m256 const & x) {
        v = x;
        return *this;
    }

    inline Scalarf8 sqrt() const
    {
        return _mm256_sqrt_ps(v);
    }

    inline Scalarf8 rsqrt() const
    {
        return _mm256_rsqrt_ps(v);
    }

    Scalarf8 & load(float const * p) {
        v = _mm256_loadu_ps(p);
        return *this;
    }

    void store(float * p) const {
        _mm256_storeu_ps(p, v);
    }

    inline float reduce() const {
        /* ( x3+x7, x2+x6, x1+x5, x0+x4 ) */
        const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(v, 1), _mm256_castps256_ps128(v));
        /* ( -, -, x1+x3+x5+x7, x0+x2+x4+x6 ) */
        const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
        /* ( -, -, -, x0+x1+x2+x3+x4+x5+x6+x7 ) */
        const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
        /* Conversion to float is a no-op on x86-64 */
        return _mm_cvtss_f32(x32);
    }
};

inline Scalarf8 operator - (Scalarf8& a) {
    return _mm256_sub_ps(_mm256_set1_ps(0.0), a.v);
}

static inline Scalarf8 multiplyAndAdd(const Scalarf8& a, const Scalarf8& b, const Scalarf8& c)
{
    return _mm256_fmadd_ps(a.v, b.v, c.v);
}

static inline Scalarf8 multiplyAndSubtract(const Scalarf8& a, const Scalarf8& b, const Scalarf8& c)
{
    return _mm256_fmsub_ps(a.v, b.v, c.v);
}

static inline Scalarf8 operator + (Scalarf8 const & a, Scalarf8 const & b) {
    return _mm256_add_ps(a.v, b.v);
}

static inline Scalarf8 & operator += (Scalarf8 & a, Scalarf8 const & b) {
    a.v = _mm256_add_ps(a.v, b.v);
    return a;
}

static inline Scalarf8 operator - (Scalarf8 const & a, Scalarf8 const & b) {
    return _mm256_sub_ps(a.v, b.v);
}

static inline Scalarf8 & operator -= (Scalarf8 & a, Scalarf8 const & b) {
    a.v = _mm256_sub_ps(a.v, b.v);
    return a;
}

static inline Scalarf8 operator * (Scalarf8 const & a, Scalarf8 const & b) {
    return _mm256_mul_ps(a.v, b.v);
}

static inline Scalarf8 & operator *= (Scalarf8 & a, Scalarf8 const & b) {
    a.v = _mm256_mul_ps(a.v, b.v);
    return a;
}

static inline Scalarf8 operator / (Scalarf8 const & a, Scalarf8 const & b) {
    return _mm256_div_ps(a.v, b.v);
}

static inline Scalarf8 & operator /= (Scalarf8 & a, Scalarf8 const & b) {
    a.v = _mm256_div_ps(a.v, b.v);
    return a;
}

static inline Scalarf8 operator == (Scalarf8 const & a, Scalarf8 const & b) {
    return _mm256_cmp_ps(a.v, b.v, 0);
}

static inline Scalarf8 operator != (Scalarf8 const & a, Scalarf8 const & b) {
    return _mm256_cmp_ps(a.v, b.v, 4);
}

static inline Scalarf8 operator < (Scalarf8 const & a, Scalarf8 const & b) {
    return _mm256_cmp_ps(a.v, b.v, 1);
}

static inline Scalarf8 operator <= (Scalarf8 const & a, Scalarf8 const & b) {
    return _mm256_cmp_ps(a.v, b.v, 2);
}

static inline Scalarf8 operator > (Scalarf8 const & a, Scalarf8 const & b) {
    return _mm256_cmp_ps(b.v, a.v, 1);
}

static inline Scalarf8 operator >= (Scalarf8 const & a, Scalarf8 const & b) {
    return _mm256_cmp_ps(b.v, a.v, 2);
}

static inline Scalarf8 min(Scalarf8 const & a, Scalarf8 const & b) {
    return _mm256_min_ps(a.v, b.v);
}

static inline Scalarf8 max(Scalarf8 const & a, Scalarf8 const & b) {
    return _mm256_max_ps(a.v, b.v);
}

template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
static inline __m256 constant8f() {
    static const union {
        int     i[8];
        __m256  ymm;
    } u = { { i0,i1,i2,i3,i4,i5,i6,i7 } };
    return u.ymm;
}

static inline Scalarf8 abs(Scalarf8 const & a) {
    __m256 mask = constant8f<0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF>();
    return _mm256_and_ps(a.v, mask);
}

//does the same as for (int i = 0; i < 8; i++) result[i] = c[i] ? a[i] : b[i];
//the elemets in c must be either 0 (false) or 0xFFFFFFFF (true)
static inline Scalarf8 blend(Scalarf8 const & c, Scalarf8 const & a, Scalarf8 const & b) {
    return _mm256_blendv_ps(b.v, a.v, c.v);
}


static inline Scalarf8 convert_zero(const unsigned int *idx, const Real *x, const unsigned char count = 8u)
{
    Scalarf8 v;
    switch (count)
    {
    case 1u:
        v.v = _mm256_setr_ps(x[idx[0]], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f); break;
    case 2u:
        v.v = _mm256_setr_ps(x[idx[0]], x[idx[1]], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f); break;
    case 3u:
        v.v = _mm256_setr_ps(x[idx[0]], x[idx[1]], x[idx[2]], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f); break;
    case 4u:
        v.v = _mm256_setr_ps(x[idx[0]], x[idx[1]], x[idx[2]], x[idx[3]], 0.0f, 0.0f, 0.0f, 0.0f); break;
    case 5u:
        v.v = _mm256_setr_ps(x[idx[0]], x[idx[1]], x[idx[2]], x[idx[3]], x[idx[4]], 0.0f, 0.0f, 0.0f); break;
    case 6u:
        v.v = _mm256_setr_ps(x[idx[0]], x[idx[1]], x[idx[2]], x[idx[3]], x[idx[4]], x[idx[5]], 0.0f, 0.0f); break;
    case 7u:
        v.v = _mm256_setr_ps(x[idx[0]], x[idx[1]], x[idx[2]], x[idx[3]], x[idx[4]], x[idx[5]], x[idx[6]], 0.0f); break;
    case 8u:
        v.v = _mm256_setr_ps(x[idx[0]], x[idx[1]], x[idx[2]], x[idx[3]], x[idx[4]], x[idx[5]], x[idx[6]], x[idx[7]]); break;
    }
    return v;
}

static inline Scalarf8 convert_zero(const Real x, const unsigned char count = 8u)
{
    Scalarf8 v;
    switch (count)
    {
    case 1u:
        v.v = _mm256_setr_ps(x, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f); break;
    case 2u:
        v.v = _mm256_setr_ps(x, x, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f); break;
    case 3u:
        v.v = _mm256_setr_ps(x, x, x, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f); break;
    case 4u:
        v.v = _mm256_setr_ps(x, x, x, x, 0.0f, 0.0f, 0.0f, 0.0f); break;
    case 5u:
        v.v = _mm256_setr_ps(x, x, x, x, x, 0.0f, 0.0f, 0.0f); break;
    case 6u:
        v.v = _mm256_setr_ps(x, x, x, x, x, x, 0.0f, 0.0f); break;
    case 7u:
        v.v = _mm256_setr_ps(x, x, x, x, x, x, x, 0.0f); break;
    case 8u:
        v.v = _mm256_setr_ps(x, x, x, x, x, x, x, x); break;
    }
    return v;
}

static inline Scalarf8 convert_one(const unsigned int *idx, const Real *x, const unsigned char count = 8u)
{
    Scalarf8 v;
    switch (count)
    {
    case 1u:
        v.v = _mm256_setr_ps(x[idx[0]], 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f); break;
    case 2u:
        v.v = _mm256_setr_ps(x[idx[0]], x[idx[1]], 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f); break;
    case 3u:
        v.v = _mm256_setr_ps(x[idx[0]], x[idx[1]], x[idx[2]], 1.0f, 1.0f, 1.0f, 1.0f, 1.0f); break;
    case 4u:
        v.v = _mm256_setr_ps(x[idx[0]], x[idx[1]], x[idx[2]], x[idx[3]], 1.0f, 1.0f, 1.0f, 1.0f); break;
    case 5u:
        v.v = _mm256_setr_ps(x[idx[0]], x[idx[1]], x[idx[2]], x[idx[3]], x[idx[4]], 1.0f, 1.0f, 1.0f); break;
    case 6u:
        v.v = _mm256_setr_ps(x[idx[0]], x[idx[1]], x[idx[2]], x[idx[3]], x[idx[4]], x[idx[5]], 1.0f, 1.0f); break;
    case 7u:
        v.v = _mm256_setr_ps(x[idx[0]], x[idx[1]], x[idx[2]], x[idx[3]], x[idx[4]], x[idx[5]], x[idx[6]], 1.0f); break;
    case 8u:
        v.v = _mm256_setr_ps(x[idx[0]], x[idx[1]], x[idx[2]], x[idx[3]], x[idx[4]], x[idx[5]], x[idx[6]], x[idx[7]]); break;
    }
    return v;
}

// ----------------------------------------------------------------------------------------------
//3 dimensional vector of Scalar8f to represent 8 3d vectors
class Vector3f8
{
public:

    Scalarf8 v[3];

    Vector3f8() {}
    Vector3f8(const bool ) { v[0] = Scalarf8(0.0f); v[1] = Scalarf8(0.0f); v[2] = Scalarf8(0.0f); }
    Vector3f8(const Scalarf8 &x, const Scalarf8 &y, const Scalarf8 &z) { v[0] = x; v[1] = y; v[2] = z; }
    Vector3f8(const Scalarf8 &x) { v[0] = v[1] = v[2] = x; }
    Vector3f8(const Vector3f &x)
    {
        v[0].v = _mm256_set1_ps(x[0]);
        v[1].v = _mm256_set1_ps(x[1]);
        v[2].v = _mm256_set1_ps(x[2]);
    }
    Vector3f8(const Vector3f &v0, const Vector3f &v1, const Vector3f &v2, const Vector3f &v3, const Vector3f &v4, const Vector3f &v5, const Vector3f &v6, const Vector3f &v7)
    {
        Scalarf8 x(v0[0], v1[0], v2[0], v3[0], v4[0], v5[0], v6[0], v7[0]);
        Scalarf8 y(v0[1], v1[1], v2[1], v3[1], v4[1], v5[1], v6[1], v7[1]);
        Scalarf8 z(v0[2], v1[2], v2[2], v3[2], v4[2], v5[2], v6[2], v7[2]);
        v[0] = x; v[1] = y; v[2] = z;
    }
    Vector3f8(Vector3f const *x)
    {
        Scalarf8 vx(x[0][0], x[1][0], x[2][0], x[3][0], x[4][0], x[5][0], x[6][0], x[7][0]);
        Scalarf8 vy(x[0][1], x[1][1], x[2][1], x[3][1], x[4][1], x[5][1], x[6][1], x[7][1]);
        Scalarf8 vz(x[0][2], x[1][2], x[2][2], x[3][2], x[4][2], x[5][2], x[6][2], x[7][2]);
        v[0] = vx; v[1] = vy; v[2] = vz;
    }

    inline void setZero() { v[0].v = _mm256_setzero_ps(); v[1].v = _mm256_setzero_ps(); v[2].v = _mm256_setzero_ps();
    }

    inline Scalarf8& operator [] (int i) { return v[i]; }
    inline const Scalarf8& operator [] (int i) const { return v[i]; }

    inline Scalarf8& x() { return v[0]; }
    inline Scalarf8& y() { return v[1]; }
    inline Scalarf8& z() { return v[2]; }

    inline const Scalarf8& x() const { return v[0]; }
    inline const Scalarf8& y() const { return v[1]; }
    inline const Scalarf8& z() const { return v[2]; }

    inline Scalarf8 dot(const Vector3f8& a) const {
        Scalarf8 res;
        //res.v = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(v[0].v, a.v[0].v), _mm256_mul_ps(v[1].v, a.v[1].v)), _mm256_mul_ps(v[2].v, a.v[2].v));
        res.v = _mm256_fmadd_ps(v[0].v, a.v[0].v, _mm256_fmadd_ps(v[1].v, a.v[1].v, _mm256_mul_ps(v[2].v, a.v[2].v)));
        return res;
    }

    //dot product
    inline Scalarf8 operator * (const Vector3f8& a) const {
        Scalarf8 res;
        //res.v = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(v[0].v, a.v[0].v), _mm256_mul_ps(v[1].v, a.v[1].v)), _mm256_mul_ps(v[2].v, a.v[2].v));
        res.v = _mm256_fmadd_ps(v[0].v, a.v[0].v, _mm256_fmadd_ps(v[1].v, a.v[1].v, _mm256_mul_ps(v[2].v, a.v[2].v)));
        return res;
    }

    inline void cross(const Vector3f8& a, const Vector3f8& b) {
        v[0] = a.v[1] * b.v[2] - a.v[2] * b.v[1];
        v[1] = a.v[2] * b.v[0] - a.v[0] * b.v[2];
        v[2] = a.v[0] * b.v[1] - a.v[1] * b.v[0];
    }

    //cross product
    inline const Vector3f8 operator % (const Vector3f8& a) const {
        return Vector3f8(v[1] * a.v[2] - v[2] * a.v[1],
            v[2] * a.v[0] - v[0] * a.v[2],
            v[0] * a.v[1] - v[1] * a.v[0]);
    }

    inline Vector3f8& operator *= (const Scalarf8 &s) {
        v[0] *= s;
        v[1] *= s;
        v[2] *= s;
        return *this;
    }

    inline const Vector3f8 operator / (const Scalarf8 &s) const {
        return Vector3f8(v[0] / s, v[1] / s, v[2] / s);
    }

    inline Vector3f8& operator /= (const Scalarf8 &s) {
        v[0] = v[0] / s;
        v[1] = v[1] / s;
        v[2] = v[2] / s;
        return *this;
    }

    inline const Vector3f8 operator - () const {
        return Vector3f8(Scalarf8(-1.0) * v[0], Scalarf8(-1.0) * v[1], Scalarf8(-1.0) * v[2]);
    }

    inline Scalarf8 squaredNorm() const {
        //return _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(v[0].v, v[0].v), _mm256_mul_ps(v[1].v, v[1].v)), _mm256_mul_ps(v[2].v, v[2].v));
        return _mm256_fmadd_ps(v[0].v, v[0].v, _mm256_fmadd_ps(v[1].v, v[1].v, _mm256_mul_ps(v[2].v, v[2].v)));
    }

    inline Scalarf8 norm() const
    {
        //return _mm256_sqrt_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(v[0].v, v[0].v), _mm256_mul_ps(v[1].v, v[1].v)), _mm256_mul_ps(v[2].v, v[2].v)));
        return _mm256_sqrt_ps(_mm256_fmadd_ps(v[0].v, v[0].v, _mm256_fmadd_ps(v[1].v, v[1].v, _mm256_mul_ps(v[2].v, v[2].v))));
    }

    inline void normalize()
    {
        const Scalarf8 norm = (v[0] * v[0] + v[1] * v[1] + v[2] * v[2]).sqrt();
        v[0] = v[0] / norm;
        v[1] = v[1] / norm;
        v[2] = v[2] / norm;
        *this = blend(norm >= Scalarf8(1e-5f), *this, Vector3f8(true));
    }

    //does the same as for (int i = 0; i < 8; i++) result[i] = c[i] ? a[i] : b[i];
    //the elements in c must be either 0 (false) or 0xFFFFFFFF (true)
    static inline Vector3f8 blend(Scalarf8 const & c, Vector3f8 const & a, Vector3f8 const & b) {
        Vector3f8 result;
        result.x() = _mm256_blendv_ps(b.x().v, a.x().v, c.v);
        result.y() = _mm256_blendv_ps(b.y().v, a.y().v, c.v);
        result.z() = _mm256_blendv_ps(b.z().v, a.z().v, c.v);
        return result;
    }

    inline void store(std::vector<Vector3r>& Vf) const
    {
        for (int i = 0; i < 3; i++)
        {
            float val[8];
            v[i].store(val);
            for (int k = 0; k < 8; k++)
                Vf[k][i] = val[k];
        }
    }

    inline void store(Vector3r* Vf) const
    {
        for (int i = 0; i < 3; i++)
        {
            float val[8];
            v[i].store(val);
            for (int k = 0; k < 8; k++)
                Vf[k][i] = val[k];
        }
    }

    inline Vector3r reduce() const {
        Vector3r res;
        res[0] = v[0].reduce();
        res[1] = v[1].reduce();
        res[2] = v[2].reduce();
        return res;
    }
};



inline Vector3f8 operator + (Vector3f8 const& a, Vector3f8 const& b) {
    Vector3f8 res;
    res.v[0].v = _mm256_add_ps(a[0].v, b[0].v);
    res.v[1].v = _mm256_add_ps(a[1].v, b[1].v);
    res.v[2].v = _mm256_add_ps(a[2].v, b[2].v);
    return res;
}

inline Vector3f8 operator - (Vector3f8 const& a, Vector3f8 const& b) {
    Vector3f8 res;
    res.v[0].v = _mm256_sub_ps(a[0].v, b[0].v);
    res.v[1].v = _mm256_sub_ps(a[1].v, b[1].v);
    res.v[2].v = _mm256_sub_ps(a[2].v, b[2].v);
    return res;
}

inline Vector3f8& operator += (Vector3f8& a, Vector3f8 const& b) {
    a[0].v = _mm256_add_ps(a[0].v, b[0].v);
    a[1].v = _mm256_add_ps(a[1].v, b[1].v);
    a[2].v = _mm256_add_ps(a[2].v, b[2].v);
    return a;
}

inline Vector3f8& operator -= (Vector3f8& a, Vector3f8 const& b) {
    a[0].v = _mm256_sub_ps(a[0].v, b[0].v);
    a[1].v = _mm256_sub_ps(a[1].v, b[1].v);
    a[2].v = _mm256_sub_ps(a[2].v, b[2].v);
    return a;
}

inline Vector3f8 operator * (Vector3f8 const& a, const Scalarf8& s) {
    Vector3f8 res;
    res.v[0].v = _mm256_mul_ps(a[0].v, s.v);
    res.v[1].v = _mm256_mul_ps(a[1].v, s.v);
    res.v[2].v = _mm256_mul_ps(a[2].v, s.v);
    return res;
}

inline Vector3f8 convertVec_zero(const unsigned int* idx, const Real* v, const unsigned char count = 8u)
{
    Vector3f8 x;
    switch (count)
    {
    case 1u:
        x.v[0].v = _mm256_setr_ps(v[3*idx[0]+0], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.v[1].v = _mm256_setr_ps(v[3*idx[0]+1], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.v[2].v = _mm256_setr_ps(v[3*idx[0]+2], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        break;
    case 2u:
        x.v[0].v = _mm256_setr_ps(v[3*idx[0]+0], v[3*idx[1]+0], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.v[1].v = _mm256_setr_ps(v[3*idx[0]+1], v[3*idx[1]+1], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.v[2].v = _mm256_setr_ps(v[3*idx[0]+2], v[3*idx[1]+2], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        break;
    case 3u:
        x.v[0].v = _mm256_setr_ps(v[3*idx[0]+0], v[3*idx[1]+0], v[3*idx[2]+0], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.v[1].v = _mm256_setr_ps(v[3*idx[0]+1], v[3*idx[1]+1], v[3*idx[2]+1], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.v[2].v = _mm256_setr_ps(v[3*idx[0]+2], v[3*idx[1]+2], v[3*idx[2]+2], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        break;
    case 4u:
        x.v[0].v = _mm256_setr_ps(v[3*idx[0]+0], v[3*idx[1]+0], v[3*idx[2]+0], v[3*idx[3]+0], 0.0f, 0.0f, 0.0f, 0.0f);
        x.v[1].v = _mm256_setr_ps(v[3*idx[0]+1], v[3*idx[1]+1], v[3*idx[2]+1], v[3*idx[3]+1], 0.0f, 0.0f, 0.0f, 0.0f);
        x.v[2].v = _mm256_setr_ps(v[3*idx[0]+2], v[3*idx[1]+2], v[3*idx[2]+2], v[3*idx[3]+2], 0.0f, 0.0f, 0.0f, 0.0f);
        break;
    case 5u:
        x.v[0].v = _mm256_setr_ps(v[3*idx[0]+0], v[3*idx[1]+0], v[3*idx[2]+0], v[3*idx[3]+0], v[3*idx[4]+0], 0.0f, 0.0f, 0.0f);
        x.v[1].v = _mm256_setr_ps(v[3*idx[0]+1], v[3*idx[1]+1], v[3*idx[2]+1], v[3*idx[3]+1], v[3*idx[4]+1], 0.0f, 0.0f, 0.0f);
        x.v[2].v = _mm256_setr_ps(v[3*idx[0]+2], v[3*idx[1]+2], v[3*idx[2]+2], v[3*idx[3]+2], v[3*idx[4]+2], 0.0f, 0.0f, 0.0f);
        break;
    case 6u:
        x.v[0].v = _mm256_setr_ps(v[3*idx[0]+0], v[3*idx[1]+0], v[3*idx[2]+0], v[3*idx[3]+0], v[3*idx[4]+0], v[3*idx[5]+0], 0.0f, 0.0f);
        x.v[1].v = _mm256_setr_ps(v[3*idx[0]+1], v[3*idx[1]+1], v[3*idx[2]+1], v[3*idx[3]+1], v[3*idx[4]+1], v[3*idx[5]+1], 0.0f, 0.0f);
        x.v[2].v = _mm256_setr_ps(v[3*idx[0]+2], v[3*idx[1]+2], v[3*idx[2]+2], v[3*idx[3]+2], v[3*idx[4]+2], v[3*idx[5]+2], 0.0f, 0.0f);
        break;
    case 7u:
        x.v[0].v = _mm256_setr_ps(v[3*idx[0]+0], v[3*idx[1]+0], v[3*idx[2]+0], v[3*idx[3]+0], v[3*idx[4]+0], v[3*idx[5]+0], v[3*idx[6]+0], 0.0f);
        x.v[1].v = _mm256_setr_ps(v[3*idx[0]+1], v[3*idx[1]+1], v[3*idx[2]+1], v[3*idx[3]+1], v[3*idx[4]+1], v[3*idx[5]+1], v[3*idx[6]+1], 0.0f);
        x.v[2].v = _mm256_setr_ps(v[3*idx[0]+2], v[3*idx[1]+2], v[3*idx[2]+2], v[3*idx[3]+2], v[3*idx[4]+2], v[3*idx[5]+2], v[3*idx[6]+2], 0.0f);
        break;
    case 8u:
        x.v[0].v = _mm256_setr_ps(v[3*idx[0]+0], v[3*idx[1]+0], v[3*idx[2]+0], v[3*idx[3]+0], v[3*idx[4]+0], v[3*idx[5]+0], v[3*idx[6]+0], v[3*idx[7]+0]);
        x.v[1].v = _mm256_setr_ps(v[3*idx[0]+1], v[3*idx[1]+1], v[3*idx[2]+1], v[3*idx[3]+1], v[3*idx[4]+1], v[3*idx[5]+1], v[3*idx[6]+1], v[3*idx[7]+1]);
        x.v[2].v = _mm256_setr_ps(v[3*idx[0]+2], v[3*idx[1]+2], v[3*idx[2]+2], v[3*idx[3]+2], v[3*idx[4]+2], v[3*idx[5]+2], v[3*idx[6]+2], v[3*idx[7]+2]);
    }
    return x;
}

static inline Vector3f8 convertVec_zero(const unsigned int *idx, const Vector3r *v, const unsigned char count = 8u)
{
    Vector3f8 x;
    switch (count)
    {
    case 1u:
        x.v[0].v = _mm256_setr_ps(v[idx[0]][0], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.v[1].v = _mm256_setr_ps(v[idx[0]][1], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.v[2].v = _mm256_setr_ps(v[idx[0]][2], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        break;
    case 2u:
        x.v[0].v = _mm256_setr_ps(v[idx[0]][0], v[idx[1]][0], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.v[1].v = _mm256_setr_ps(v[idx[0]][1], v[idx[1]][1], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.v[2].v = _mm256_setr_ps(v[idx[0]][2], v[idx[1]][2], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        break;
    case 3u:
        x.v[0].v = _mm256_setr_ps(v[idx[0]][0], v[idx[1]][0], v[idx[2]][0], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.v[1].v = _mm256_setr_ps(v[idx[0]][1], v[idx[1]][1], v[idx[2]][1], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.v[2].v = _mm256_setr_ps(v[idx[0]][2], v[idx[1]][2], v[idx[2]][2], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        break;
    case 4u:
        x.v[0].v = _mm256_setr_ps(v[idx[0]][0], v[idx[1]][0], v[idx[2]][0], v[idx[3]][0], 0.0f, 0.0f, 0.0f, 0.0f);
        x.v[1].v = _mm256_setr_ps(v[idx[0]][1], v[idx[1]][1], v[idx[2]][1], v[idx[3]][1], 0.0f, 0.0f, 0.0f, 0.0f);
        x.v[2].v = _mm256_setr_ps(v[idx[0]][2], v[idx[1]][2], v[idx[2]][2], v[idx[3]][2], 0.0f, 0.0f, 0.0f, 0.0f);
        break;
    case 5u:
        x.v[0].v = _mm256_setr_ps(v[idx[0]][0], v[idx[1]][0], v[idx[2]][0], v[idx[3]][0], v[idx[4]][0], 0.0f, 0.0f, 0.0f);
        x.v[1].v = _mm256_setr_ps(v[idx[0]][1], v[idx[1]][1], v[idx[2]][1], v[idx[3]][1], v[idx[4]][1], 0.0f, 0.0f, 0.0f);
        x.v[2].v = _mm256_setr_ps(v[idx[0]][2], v[idx[1]][2], v[idx[2]][2], v[idx[3]][2], v[idx[4]][2], 0.0f, 0.0f, 0.0f);
        break;
    case 6u:
        x.v[0].v = _mm256_setr_ps(v[idx[0]][0], v[idx[1]][0], v[idx[2]][0], v[idx[3]][0], v[idx[4]][0], v[idx[5]][0], 0.0f, 0.0f);
        x.v[1].v = _mm256_setr_ps(v[idx[0]][1], v[idx[1]][1], v[idx[2]][1], v[idx[3]][1], v[idx[4]][1], v[idx[5]][1], 0.0f, 0.0f);
        x.v[2].v = _mm256_setr_ps(v[idx[0]][2], v[idx[1]][2], v[idx[2]][2], v[idx[3]][2], v[idx[4]][2], v[idx[5]][2], 0.0f, 0.0f);
        break;
    case 7u:
        x.v[0].v = _mm256_setr_ps(v[idx[0]][0], v[idx[1]][0], v[idx[2]][0], v[idx[3]][0], v[idx[4]][0], v[idx[5]][0], v[idx[6]][0], 0.0f);
        x.v[1].v = _mm256_setr_ps(v[idx[0]][1], v[idx[1]][1], v[idx[2]][1], v[idx[3]][1], v[idx[4]][1], v[idx[5]][1], v[idx[6]][1], 0.0f);
        x.v[2].v = _mm256_setr_ps(v[idx[0]][2], v[idx[1]][2], v[idx[2]][2], v[idx[3]][2], v[idx[4]][2], v[idx[5]][2], v[idx[6]][2], 0.0f);
        break;
    case 8u:
        x.v[0].v = _mm256_setr_ps(v[idx[0]][0], v[idx[1]][0], v[idx[2]][0], v[idx[3]][0], v[idx[4]][0], v[idx[5]][0], v[idx[6]][0], v[idx[7]][0]);
        x.v[1].v = _mm256_setr_ps(v[idx[0]][1], v[idx[1]][1], v[idx[2]][1], v[idx[3]][1], v[idx[4]][1], v[idx[5]][1], v[idx[6]][1], v[idx[7]][1]);
        x.v[2].v = _mm256_setr_ps(v[idx[0]][2], v[idx[1]][2], v[idx[2]][2], v[idx[3]][2], v[idx[4]][2], v[idx[5]][2], v[idx[6]][2], v[idx[7]][2]);
    }
    return x;
}


// ----------------------------------------------------------------------------------------------
//3x3 dimensional matrix of Scalar8f to represent 8 3x3 matrices
class Matrix3f8
{
public:
    Scalarf8 m[3][3];

    Matrix3f8() {  }

    Matrix3f8(const Matrix3f &x)
    {
        const Vector3f8 m1(x.col(0));
        const Vector3f8 m2(x.col(1));
        const Vector3f8 m3(x.col(2));
        m[0][0] = m1.x();
        m[1][0] = m1.y();
        m[2][0] = m1.z();

        m[0][1] = m2.x();
        m[1][1] = m2.y();
        m[2][1] = m2.z();

        m[0][2] = m3.x();
        m[1][2] = m3.y();
        m[2][2] = m3.z();
    }

    //constructor to create matrix from 3 column vectors
    Matrix3f8(const Vector3f8& m1, const Vector3f8& m2, const Vector3f8& m3)
    {
        m[0][0] = m1.x();
        m[1][0] = m1.y();
        m[2][0] = m1.z();

        m[0][1] = m2.x();
        m[1][1] = m2.y();
        m[2][1] = m2.z();

        m[0][2] = m3.x();
        m[1][2] = m3.y();
        m[2][2] = m3.z();
    }

    inline void setZero()
    {
        m[0][0].v = _mm256_setzero_ps();
        m[1][0].v = _mm256_setzero_ps();
        m[2][0].v = _mm256_setzero_ps();

        m[0][1].v = _mm256_setzero_ps();
        m[1][1].v = _mm256_setzero_ps();
        m[2][1].v = _mm256_setzero_ps();

        m[0][2].v = _mm256_setzero_ps();
        m[1][2].v = _mm256_setzero_ps();
        m[2][2].v = _mm256_setzero_ps();
    }

    inline Scalarf8& operator()(int i, int j) { return m[i][j]; }

    inline void setCol(int i, const Vector3f8& v)
    {
        m[0][i] = v.x();
        m[1][i] = v.y();
        m[2][i] = v.z();
    }

    inline void setCol(int i, const Scalarf8& x, const Scalarf8& y, const Scalarf8& z)
    {
        m[0][i] = x;
        m[1][i] = y;
        m[2][i] = z;
    }

    inline Matrix3f8 operator * (const Scalarf8 &b) const
    {
        Matrix3f8 A;

        A.m[0][0] = m[0][0] * b;
        A.m[0][1] = m[0][1] * b;
        A.m[0][2] = m[0][2] * b;

        A.m[1][0] = m[1][0] * b;
        A.m[1][1] = m[1][1] * b;
        A.m[1][2] = m[1][2] * b;

        A.m[2][0] = m[2][0] * b;
        A.m[2][1] = m[2][1] * b;
        A.m[2][2] = m[2][2] * b;

        return A;
    }

    inline Vector3f8 operator * (const Vector3f8 &b) const
    {
        Vector3f8 A;

        A.v[0] = m[0][0] * b.v[0] + m[0][1] * b.v[1] + m[0][2] * b.v[2];
        A.v[1] = m[1][0] * b.v[0] + m[1][1] * b.v[1] + m[1][2] * b.v[2];
        A.v[2] = m[2][0] * b.v[0] + m[2][1] * b.v[1] + m[2][2] * b.v[2];

        return A;
    }

    inline Matrix3f8 operator * (const Matrix3f8 &b) const
    {
        Matrix3f8 A;

        A.m[0][0] = m[0][0] * b.m[0][0] + m[0][1] * b.m[1][0] + m[0][2] * b.m[2][0];
        A.m[0][1] = m[0][0] * b.m[0][1] + m[0][1] * b.m[1][1] + m[0][2] * b.m[2][1];
        A.m[0][2] = m[0][0] * b.m[0][2] + m[0][1] * b.m[1][2] + m[0][2] * b.m[2][2];

        A.m[1][0] = m[1][0] * b.m[0][0] + m[1][1] * b.m[1][0] + m[1][2] * b.m[2][0];
        A.m[1][1] = m[1][0] * b.m[0][1] + m[1][1] * b.m[1][1] + m[1][2] * b.m[2][1];
        A.m[1][2] = m[1][0] * b.m[0][2] + m[1][1] * b.m[1][2] + m[1][2] * b.m[2][2];

        A.m[2][0] = m[2][0] * b.m[0][0] + m[2][1] * b.m[1][0] + m[2][2] * b.m[2][0];
        A.m[2][1] = m[2][0] * b.m[0][1] + m[2][1] * b.m[1][1] + m[2][2] * b.m[2][1];
        A.m[2][2] = m[2][0] * b.m[0][2] + m[2][1] * b.m[1][2] + m[2][2] * b.m[2][2];

        return A;
    }

    inline Matrix3f8& operator += (const Matrix3f8& a) {
        m[0][0] += a.m[0][0];
        m[1][0] += a.m[1][0];
        m[2][0] += a.m[2][0];

        m[0][1] += a.m[0][1];
        m[1][1] += a.m[1][1];
        m[2][1] += a.m[2][1];

        m[0][2] += a.m[0][2];
        m[1][2] += a.m[1][2];
        m[2][2] += a.m[2][2];
        return *this;
    }

    inline Matrix3f8 transpose() const
    {
        Matrix3f8 A;
        A.m[0][0] = m[0][0]; A.m[0][1] = m[1][0]; A.m[0][2] = m[2][0];
        A.m[1][0] = m[0][1]; A.m[1][1] = m[1][1]; A.m[1][2] = m[2][1];
        A.m[2][0] = m[0][2]; A.m[2][1] = m[1][2]; A.m[2][2] = m[2][2];

        return A;
    }

    inline Scalarf8 determinant() const
    {
        return  m[0][1] * m[1][2] * m[2][0] - m[0][2] * m[1][1] * m[2][0] + m[0][2] * m[1][0] * m[2][1]
              - m[0][0] * m[1][2] * m[2][1] - m[0][1] * m[1][0] * m[2][2] + m[0][0] * m[1][1] * m[2][2];
    }

    inline void store(std::vector<Matrix3r>& Mf) const
    {
        for (int i = 0; i < 3; i++)
        {
            for (int j = 0; j < 3; j++)
            {
                float val[8];
                m[i][j].store(val);
                for (int k = 0; k < 8; k++)
                    Mf[k](i, j) = val[k];
            }
        }
    }

    inline Matrix3r reduce() const {
        Matrix3r A;
        A(0, 0) = m[0][0].reduce();
        A(0, 1) = m[0][1].reduce();
        A(0, 2) = m[0][2].reduce();

        A(1, 0) = m[1][0].reduce();
        A(1, 1) = m[1][1].reduce();
        A(1, 2) = m[1][2].reduce();

        A(2, 0) = m[2][0].reduce();
        A(2, 1) = m[2][1].reduce();
        A(2, 2) = m[2][2].reduce();
        return A;
    }
};


inline Matrix3f8& operator -= (Matrix3f8& a, Matrix3f8 const& b) {
    a.m[0][0].v = _mm256_sub_ps(a.m[0][0].v, b.m[0][0].v);
    a.m[0][1].v = _mm256_sub_ps(a.m[0][1].v, b.m[0][1].v);
    a.m[0][2].v = _mm256_sub_ps(a.m[0][2].v, b.m[0][2].v);

    a.m[1][0].v = _mm256_sub_ps(a.m[1][0].v, b.m[1][0].v);
    a.m[1][1].v = _mm256_sub_ps(a.m[1][1].v, b.m[1][1].v);
    a.m[1][2].v = _mm256_sub_ps(a.m[1][2].v, b.m[1][2].v);

    a.m[2][0].v = _mm256_sub_ps(a.m[2][0].v, b.m[2][0].v);
    a.m[2][1].v = _mm256_sub_ps(a.m[2][1].v, b.m[2][1].v);
    a.m[2][2].v = _mm256_sub_ps(a.m[2][2].v, b.m[2][2].v);

    return a;
}

static inline Matrix3f8 convertMat_zero(const unsigned int* idx, const Matrix3r* v, const unsigned char count = 8u)
{
    Matrix3f8 x;
    switch (count)
    {
    case 1u:
        x.m[0][0] = _mm256_setr_ps(v[idx[0]](0, 0), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[0][1] = _mm256_setr_ps(v[idx[0]](0, 1), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[0][2] = _mm256_setr_ps(v[idx[0]](0, 2), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[1][0] = _mm256_setr_ps(v[idx[0]](1, 0), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[1][1] = _mm256_setr_ps(v[idx[0]](1, 1), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[1][2] = _mm256_setr_ps(v[idx[0]](1, 2), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[2][0] = _mm256_setr_ps(v[idx[0]](2, 0), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[2][1] = _mm256_setr_ps(v[idx[0]](2, 1), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[2][2] = _mm256_setr_ps(v[idx[0]](2, 2), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        break;
    case 2u:
        x.m[0][0] = _mm256_setr_ps(v[idx[0]](0, 0), v[idx[1]](0, 0), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[0][1] = _mm256_setr_ps(v[idx[0]](0, 1), v[idx[1]](0, 1), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[0][2] = _mm256_setr_ps(v[idx[0]](0, 2), v[idx[1]](0, 2), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[1][0] = _mm256_setr_ps(v[idx[0]](1, 0), v[idx[1]](1, 0), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[1][1] = _mm256_setr_ps(v[idx[0]](1, 1), v[idx[1]](1, 1), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[1][2] = _mm256_setr_ps(v[idx[0]](1, 2), v[idx[1]](1, 2), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[2][0] = _mm256_setr_ps(v[idx[0]](2, 0), v[idx[1]](2, 0), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[2][1] = _mm256_setr_ps(v[idx[0]](2, 1), v[idx[1]](2, 1), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[2][2] = _mm256_setr_ps(v[idx[0]](2, 2), v[idx[1]](2, 2), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        break;
    case 3u:
        x.m[0][0] = _mm256_setr_ps(v[idx[0]](0, 0), v[idx[1]](0, 0), v[idx[2]](0, 0), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[0][1] = _mm256_setr_ps(v[idx[0]](0, 1), v[idx[1]](0, 1), v[idx[2]](0, 1), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[0][2] = _mm256_setr_ps(v[idx[0]](0, 2), v[idx[1]](0, 2), v[idx[2]](0, 2), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[1][0] = _mm256_setr_ps(v[idx[0]](1, 0), v[idx[1]](1, 0), v[idx[2]](1, 0), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[1][1] = _mm256_setr_ps(v[idx[0]](1, 1), v[idx[1]](1, 1), v[idx[2]](1, 1), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[1][2] = _mm256_setr_ps(v[idx[0]](1, 2), v[idx[1]](1, 2), v[idx[2]](1, 2), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[2][0] = _mm256_setr_ps(v[idx[0]](2, 0), v[idx[1]](2, 0), v[idx[2]](2, 0), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[2][1] = _mm256_setr_ps(v[idx[0]](2, 1), v[idx[1]](2, 1), v[idx[2]](2, 1), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[2][2] = _mm256_setr_ps(v[idx[0]](2, 2), v[idx[1]](2, 2), v[idx[2]](2, 2), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
        break;
    case 4u:
        x.m[0][0] = _mm256_setr_ps(v[idx[0]](0, 0), v[idx[1]](0, 0), v[idx[2]](0, 0), v[idx[3]](0, 0), 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[0][1] = _mm256_setr_ps(v[idx[0]](0, 1), v[idx[1]](0, 1), v[idx[2]](0, 1), v[idx[3]](0, 1), 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[0][2] = _mm256_setr_ps(v[idx[0]](0, 2), v[idx[1]](0, 2), v[idx[2]](0, 2), v[idx[3]](0, 2), 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[1][0] = _mm256_setr_ps(v[idx[0]](1, 0), v[idx[1]](1, 0), v[idx[2]](1, 0), v[idx[3]](1, 0), 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[1][1] = _mm256_setr_ps(v[idx[0]](1, 1), v[idx[1]](1, 1), v[idx[2]](1, 1), v[idx[3]](1, 1), 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[1][2] = _mm256_setr_ps(v[idx[0]](1, 2), v[idx[1]](1, 2), v[idx[2]](1, 2), v[idx[3]](1, 2), 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[2][0] = _mm256_setr_ps(v[idx[0]](2, 0), v[idx[1]](2, 0), v[idx[2]](2, 0), v[idx[3]](2, 0), 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[2][1] = _mm256_setr_ps(v[idx[0]](2, 1), v[idx[1]](2, 1), v[idx[2]](2, 1), v[idx[3]](2, 1), 0.0f, 0.0f, 0.0f, 0.0f);
        x.m[2][2] = _mm256_setr_ps(v[idx[0]](2, 2), v[idx[1]](2, 2), v[idx[2]](2, 2), v[idx[3]](2, 2), 0.0f, 0.0f, 0.0f, 0.0f);
        break;
    case 5u:
        x.m[0][0] = _mm256_setr_ps(v[idx[0]](0, 0), v[idx[1]](0, 0), v[idx[2]](0, 0), v[idx[3]](0, 0), v[idx[4]](0, 0), 0.0f, 0.0f, 0.0f);
        x.m[0][1] = _mm256_setr_ps(v[idx[0]](0, 1), v[idx[1]](0, 1), v[idx[2]](0, 1), v[idx[3]](0, 1), v[idx[4]](0, 1), 0.0f, 0.0f, 0.0f);
        x.m[0][2] = _mm256_setr_ps(v[idx[0]](0, 2), v[idx[1]](0, 2), v[idx[2]](0, 2), v[idx[3]](0, 2), v[idx[4]](0, 2), 0.0f, 0.0f, 0.0f);
        x.m[1][0] = _mm256_setr_ps(v[idx[0]](1, 0), v[idx[1]](1, 0), v[idx[2]](1, 0), v[idx[3]](1, 0), v[idx[4]](1, 0), 0.0f, 0.0f, 0.0f);
        x.m[1][1] = _mm256_setr_ps(v[idx[0]](1, 1), v[idx[1]](1, 1), v[idx[2]](1, 1), v[idx[3]](1, 1), v[idx[4]](1, 1), 0.0f, 0.0f, 0.0f);
        x.m[1][2] = _mm256_setr_ps(v[idx[0]](1, 2), v[idx[1]](1, 2), v[idx[2]](1, 2), v[idx[3]](1, 2), v[idx[4]](1, 2), 0.0f, 0.0f, 0.0f);
        x.m[2][0] = _mm256_setr_ps(v[idx[0]](2, 0), v[idx[1]](2, 0), v[idx[2]](2, 0), v[idx[3]](2, 0), v[idx[4]](2, 0), 0.0f, 0.0f, 0.0f);
        x.m[2][1] = _mm256_setr_ps(v[idx[0]](2, 1), v[idx[1]](2, 1), v[idx[2]](2, 1), v[idx[3]](2, 1), v[idx[4]](2, 1), 0.0f, 0.0f, 0.0f);
        x.m[2][2] = _mm256_setr_ps(v[idx[0]](2, 2), v[idx[1]](2, 2), v[idx[2]](2, 2), v[idx[3]](2, 2), v[idx[4]](2, 2), 0.0f, 0.0f, 0.0f);
        break;
    case 6u:
        x.m[0][0] = _mm256_setr_ps(v[idx[0]](0, 0), v[idx[1]](0, 0), v[idx[2]](0, 0), v[idx[3]](0, 0), v[idx[4]](0, 0), v[idx[5]](0, 0), 0.0f, 0.0f);
        x.m[0][1] = _mm256_setr_ps(v[idx[0]](0, 1), v[idx[1]](0, 1), v[idx[2]](0, 1), v[idx[3]](0, 1), v[idx[4]](0, 1), v[idx[5]](0, 1), 0.0f, 0.0f);
        x.m[0][2] = _mm256_setr_ps(v[idx[0]](0, 2), v[idx[1]](0, 2), v[idx[2]](0, 2), v[idx[3]](0, 2), v[idx[4]](0, 2), v[idx[5]](0, 2), 0.0f, 0.0f);
        x.m[1][0] = _mm256_setr_ps(v[idx[0]](1, 0), v[idx[1]](1, 0), v[idx[2]](1, 0), v[idx[3]](1, 0), v[idx[4]](1, 0), v[idx[5]](1, 0), 0.0f, 0.0f);
        x.m[1][1] = _mm256_setr_ps(v[idx[0]](1, 1), v[idx[1]](1, 1), v[idx[2]](1, 1), v[idx[3]](1, 1), v[idx[4]](1, 1), v[idx[5]](1, 1), 0.0f, 0.0f);
        x.m[1][2] = _mm256_setr_ps(v[idx[0]](1, 2), v[idx[1]](1, 2), v[idx[2]](1, 2), v[idx[3]](1, 2), v[idx[4]](1, 2), v[idx[5]](1, 2), 0.0f, 0.0f);
        x.m[2][0] = _mm256_setr_ps(v[idx[0]](2, 0), v[idx[1]](2, 0), v[idx[2]](2, 0), v[idx[3]](2, 0), v[idx[4]](2, 0), v[idx[5]](2, 0), 0.0f, 0.0f);
        x.m[2][1] = _mm256_setr_ps(v[idx[0]](2, 1), v[idx[1]](2, 1), v[idx[2]](2, 1), v[idx[3]](2, 1), v[idx[4]](2, 1), v[idx[5]](2, 1), 0.0f, 0.0f);
        x.m[2][2] = _mm256_setr_ps(v[idx[0]](2, 2), v[idx[1]](2, 2), v[idx[2]](2, 2), v[idx[3]](2, 2), v[idx[4]](2, 2), v[idx[5]](2, 2), 0.0f, 0.0f);
        break;
    case 7u:
        x.m[0][0] = _mm256_setr_ps(v[idx[0]](0, 0), v[idx[1]](0, 0), v[idx[2]](0, 0), v[idx[3]](0, 0), v[idx[4]](0, 0), v[idx[5]](0, 0), v[idx[6]](0, 0), 0.0f);
        x.m[0][1] = _mm256_setr_ps(v[idx[0]](0, 1), v[idx[1]](0, 1), v[idx[2]](0, 1), v[idx[3]](0, 1), v[idx[4]](0, 1), v[idx[5]](0, 1), v[idx[6]](0, 1), 0.0f);
        x.m[0][2] = _mm256_setr_ps(v[idx[0]](0, 2), v[idx[1]](0, 2), v[idx[2]](0, 2), v[idx[3]](0, 2), v[idx[4]](0, 2), v[idx[5]](0, 2), v[idx[6]](0, 2), 0.0f);
        x.m[1][0] = _mm256_setr_ps(v[idx[0]](1, 0), v[idx[1]](1, 0), v[idx[2]](1, 0), v[idx[3]](1, 0), v[idx[4]](1, 0), v[idx[5]](1, 0), v[idx[6]](1, 0), 0.0f);
        x.m[1][1] = _mm256_setr_ps(v[idx[0]](1, 1), v[idx[1]](1, 1), v[idx[2]](1, 1), v[idx[3]](1, 1), v[idx[4]](1, 1), v[idx[5]](1, 1), v[idx[6]](1, 1), 0.0f);
        x.m[1][2] = _mm256_setr_ps(v[idx[0]](1, 2), v[idx[1]](1, 2), v[idx[2]](1, 2), v[idx[3]](1, 2), v[idx[4]](1, 2), v[idx[5]](1, 2), v[idx[6]](1, 2), 0.0f);
        x.m[2][0] = _mm256_setr_ps(v[idx[0]](2, 0), v[idx[1]](2, 0), v[idx[2]](2, 0), v[idx[3]](2, 0), v[idx[4]](2, 0), v[idx[5]](2, 0), v[idx[6]](2, 0), 0.0f);
        x.m[2][1] = _mm256_setr_ps(v[idx[0]](2, 1), v[idx[1]](2, 1), v[idx[2]](2, 1), v[idx[3]](2, 1), v[idx[4]](2, 1), v[idx[5]](2, 1), v[idx[6]](2, 1), 0.0f);
        x.m[2][2] = _mm256_setr_ps(v[idx[0]](2, 2), v[idx[1]](2, 2), v[idx[2]](2, 2), v[idx[3]](2, 2), v[idx[4]](2, 2), v[idx[5]](2, 2), v[idx[6]](2, 2), 0.0f);
        break;
    case 8u:
        x.m[0][0] = _mm256_setr_ps(v[idx[0]](0, 0), v[idx[1]](0, 0), v[idx[2]](0, 0), v[idx[3]](0, 0), v[idx[4]](0, 0), v[idx[5]](0, 0), v[idx[6]](0, 0), v[idx[7]](0, 0));
        x.m[0][1] = _mm256_setr_ps(v[idx[0]](0, 1), v[idx[1]](0, 1), v[idx[2]](0, 1), v[idx[3]](0, 1), v[idx[4]](0, 1), v[idx[5]](0, 1), v[idx[6]](0, 1), v[idx[7]](0, 1));
        x.m[0][2] = _mm256_setr_ps(v[idx[0]](0, 2), v[idx[1]](0, 2), v[idx[2]](0, 2), v[idx[3]](0, 2), v[idx[4]](0, 2), v[idx[5]](0, 2), v[idx[6]](0, 2), v[idx[7]](0, 2));
        x.m[1][0] = _mm256_setr_ps(v[idx[0]](1, 0), v[idx[1]](1, 0), v[idx[2]](1, 0), v[idx[3]](1, 0), v[idx[4]](1, 0), v[idx[5]](1, 0), v[idx[6]](1, 0), v[idx[7]](1, 0));
        x.m[1][1] = _mm256_setr_ps(v[idx[0]](1, 1), v[idx[1]](1, 1), v[idx[2]](1, 1), v[idx[3]](1, 1), v[idx[4]](1, 1), v[idx[5]](1, 1), v[idx[6]](1, 1), v[idx[7]](1, 1));
        x.m[1][2] = _mm256_setr_ps(v[idx[0]](1, 2), v[idx[1]](1, 2), v[idx[2]](1, 2), v[idx[3]](1, 2), v[idx[4]](1, 2), v[idx[5]](1, 2), v[idx[6]](1, 2), v[idx[7]](1, 2));
        x.m[2][0] = _mm256_setr_ps(v[idx[0]](2, 0), v[idx[1]](2, 0), v[idx[2]](2, 0), v[idx[3]](2, 0), v[idx[4]](2, 0), v[idx[5]](2, 0), v[idx[6]](2, 0), v[idx[7]](2, 0));
        x.m[2][1] = _mm256_setr_ps(v[idx[0]](2, 1), v[idx[1]](2, 1), v[idx[2]](2, 1), v[idx[3]](2, 1), v[idx[4]](2, 1), v[idx[5]](2, 1), v[idx[6]](2, 1), v[idx[7]](2, 1));
        x.m[2][2] = _mm256_setr_ps(v[idx[0]](2, 2), v[idx[1]](2, 2), v[idx[2]](2, 2), v[idx[3]](2, 2), v[idx[4]](2, 2), v[idx[5]](2, 2), v[idx[6]](2, 2), v[idx[7]](2, 2));
    }
    return x;
}


inline void dyadicProduct(const Vector3f8 &a, const Vector3f8 &b, Matrix3f8 &res)
{
    res.m[0][0] = _mm256_mul_ps(a.x().v, b.x().v); res.m[0][1] = _mm256_mul_ps(a.x().v, b.y().v); res.m[0][2] = _mm256_mul_ps(a.x().v, b.z().v);
    res.m[1][0] = _mm256_mul_ps(a.y().v, b.x().v); res.m[1][1] = _mm256_mul_ps(a.y().v, b.y().v); res.m[1][2] = _mm256_mul_ps(a.y().v, b.z().v);
    res.m[2][0] = _mm256_mul_ps(a.z().v, b.x().v); res.m[2][1] = _mm256_mul_ps(a.z().v, b.y().v); res.m[2][2] = _mm256_mul_ps(a.z().v, b.z().v);
}



// ----------------------------------------------------------------------------------------------
//4 dimensional vector of Scalar8f to represent 8 quaternions
class Quaternion8f
{
public:

    Scalarf8  q[4];

    inline Quaternion8f() { q[0] = 0.0; q[1] = 0.0; q[2] = 0.0; q[3] = 1.0; }

    inline Quaternion8f(Scalarf8 x, Scalarf8 y, Scalarf8 z, Scalarf8 w) {
        q[0] = x; q[1] = y; q[2] = z; q[3] = w;
    }

    inline Quaternion8f(Vector3f8& v) {
        q[0] = v[0]; q[1] = v[1]; q[2] = v[2]; q[3] = 0.0;
    }

    inline Scalarf8 & operator [] (int i) { return q[i]; }
    inline Scalarf8   operator [] (int i) const { return q[i]; }

    inline Scalarf8 & x() { return q[0]; }
    inline Scalarf8 & y() { return q[1]; }
    inline Scalarf8 & z() { return q[2]; }
    inline Scalarf8 & w() { return q[3]; }

    inline Scalarf8 x() const { return q[0]; }
    inline Scalarf8 y() const { return q[1]; }
    inline Scalarf8 z() const { return q[2]; }
    inline Scalarf8 w() const { return q[3]; }

    inline const Quaternion8f operator*(const Quaternion8f& a) const {
        return
            Quaternion8f(q[3] * a.q[0] + q[0] * a.q[3] + q[1] * a.q[2] - q[2] * a.q[1],
                q[3] * a.q[1] - q[0] * a.q[2] + q[1] * a.q[3] + q[2] * a.q[0],
                q[3] * a.q[2] + q[0] * a.q[1] - q[1] * a.q[0] + q[2] * a.q[3],
                q[3] * a.q[3] - q[0] * a.q[0] - q[1] * a.q[1] - q[2] * a.q[2]);
    }

    inline void toRotationMatrix(Matrix3f8& R)
    {
        const Scalarf8 tx = Scalarf8(2.0) * q[0];
        const Scalarf8 ty = Scalarf8(2.0) * q[1];
        const Scalarf8 tz = Scalarf8(2.0) * q[2];
        const Scalarf8 twx = tx*q[3];
        const Scalarf8 twy = ty*q[3];
        const Scalarf8 twz = tz*q[3];
        const Scalarf8 txx = tx*q[0];
        const Scalarf8 txy = ty*q[0];
        const Scalarf8 txz = tz*q[0];
        const Scalarf8 tyy = ty*q[1];
        const Scalarf8 tyz = tz*q[1];
        const Scalarf8 tzz = tz*q[2];

        R.m[0][0] = Scalarf8(1.0) - (tyy + tzz);
        R.m[0][1] = txy - twz;
        R.m[0][2] = txz + twy;
        R.m[1][0] = txy + twz;
        R.m[1][1] = Scalarf8(1.0) - (txx + tzz);
        R.m[1][2] = tyz - twx;
        R.m[2][0] = txz - twy;
        R.m[2][1] = tyz + twx;
        R.m[2][2] = Scalarf8(1.0) - (txx + tyy);
    }

    inline void toRotationMatrix(Vector3f8& R1, Vector3f8& R2, Vector3f8& R3)
    {
        const Scalarf8 tx = Scalarf8(2.0) * q[0];
        const Scalarf8 ty = Scalarf8(2.0) * q[1];
        const Scalarf8 tz = Scalarf8(2.0) * q[2];
        const Scalarf8 twx = tx*q[3];
        const Scalarf8 twy = ty*q[3];
        const Scalarf8 twz = tz*q[3];
        const Scalarf8 txx = tx*q[0];
        const Scalarf8 txy = ty*q[0];
        const Scalarf8 txz = tz*q[0];
        const Scalarf8 tyy = ty*q[1];
        const Scalarf8 tyz = tz*q[1];
        const Scalarf8 tzz = tz*q[2];

        R1[0] = Scalarf8(1.0) - (tyy + tzz);
        R2[0] = txy - twz;
        R3[0] = txz + twy;
        R1[1] = txy + twz;
        R2[1] = Scalarf8(1.0) - (txx + tzz);
        R3[1] = tyz - twx;
        R1[2] = txz - twy;
        R2[2] = tyz + twx;
        R3[2] = Scalarf8(1.0) - (txx + tyy);
    }

    inline void store(std::vector<Quaternionr>& qf) const
    {
        float x[8], y[8], z[8], w[8];
        q[0].store(x);
        q[1].store(y);
        q[2].store(z);
        q[3].store(w);

        for (int i = 0; i < 8; i++)
        {
            qf[i].x() = x[i];
            qf[i].y() = y[i];
            qf[i].z() = z[i];
            qf[i].w() = w[i];
        }
    }

    inline void store(Quaternionr *qf) const
    {
        float x[8], y[8], z[8], w[8];
        q[0].store(x);
        q[1].store(y);
        q[2].store(z);
        q[3].store(w);

        for (int i = 0; i < 8; i++)
        {
            qf[i].x() = x[i];
            qf[i].y() = y[i];
            qf[i].z() = z[i];
            qf[i].w() = w[i];
        }
    }

    inline void set(const Quaternionr *qf)
    {
        float x[8], y[8], z[8], w[8];
        for(int i=0; i<8; i++)
        {
            x[i] = static_cast<float>(qf[i].x());
            y[i] = static_cast<float>(qf[i].y());
            z[i] = static_cast<float>(qf[i].z());
            w[i] = static_cast<float>(qf[i].w());
        }
        Scalarf8 s;
        s.load(x);
        q[0] = s;
        s.load(y);
        q[1] = s;
        s.load(z);
        q[2] = s;
        s.load(w);
        q[3] = s;
    }
};

#else
#include <simd/simd.h>

// ----------------------------------------------------------------------------------------------
//vector of 8 float values to represent 8 scalars
class Scalarf8
{
public:
    simd::float8 v;

    Scalarf8() {}

    Scalarf8(float f)
    {
        v = f;
    }

    //  Scalarf8(float f0, float f1, float f2, float f3, float f4, float f5, float f6, float f7) {
    //      v = _mm256_setr_ps(f0, f1, f2, f3, f4, f5, f6, f7);
    //  }

    Scalarf8(Real f0, Real f1, Real f2, Real f3, Real f4, Real f5, Real f6, Real f7)
    {

        v = { (float)f0, (float)f1, (float)f2, (float)f3,
              (float)f4, (float)f5, (float)f6, (float)f7 };
    }

    Scalarf8(float const* p)
    {
        v = *(simd::packed::float8*)p;
    }

    Scalarf8(simd::float8 const& x)
    {
        v = x;
    }

    inline void setZero()
    {
        v = 0.0f;
    }

    Scalarf8& operator = (simd::float8 const& x)
    {
        v = x;
        return *this;
    }

    inline Scalarf8 sqrt() const
    {
        return simd::sqrt(v);
    }

    inline Scalarf8 rsqrt() const
    {
        return simd::rsqrt(v);
    }

    Scalarf8& load(float const* p) {
        v = *(simd::packed::float8*)p;
        return *this;
    }

    void store(float* p) const {
        * (simd::packed::float8*)p = v;
    }

    inline float reduce() const {
        return simd_reduce_add(v);
    }
};




inline Scalarf8 operator - (Scalarf8& a) {
    return -a.v;
}


static inline Scalarf8 multiplyAndAdd(const Scalarf8& a, const Scalarf8& b, const Scalarf8& c)
{
    return simd::fma(a.v, b.v, c.v);
}

static inline Scalarf8 multiplyAndSubtract(const Scalarf8& a, const Scalarf8& b, const Scalarf8& c)
{
    return simd::fma(a.v, b.v, -c.v);
}

static inline Scalarf8 operator + (Scalarf8 const & a, Scalarf8 const & b) {
    return a.v + b.v;
}

static inline Scalarf8 & operator += (Scalarf8 & a, Scalarf8 const & b) {
    a.v += b.v;
    return a;
}

static inline Scalarf8 operator - (Scalarf8 const & a, Scalarf8 const & b) {
    return a.v - b.v;
}

static inline Scalarf8 & operator -= (Scalarf8 & a, Scalarf8 const & b) {
    a.v -= b.v;
    return a;
}

static inline Scalarf8 operator * (Scalarf8 const & a, Scalarf8 const & b) {
    return a.v * b.v;
}

static inline Scalarf8 & operator *= (Scalarf8 & a, Scalarf8 const & b) {
    a.v *= b.v;
    return a;
}

static inline Scalarf8 operator / (Scalarf8 const & a, Scalarf8 const & b) {
    return a.v / b.v;
}

static inline Scalarf8 & operator /= (Scalarf8 & a, Scalarf8 const & b) {
    a.v /= b.v;
    return a;
}

static inline Scalarf8 operator == (Scalarf8 const & a, Scalarf8 const & b) {
    return simd::convert<float>(a.v == b.v);
}

static inline Scalarf8 operator != (Scalarf8 const & a, Scalarf8 const & b) {
    return simd::convert<float>(a.v != b.v);
}

static inline Scalarf8 operator < (Scalarf8 const & a, Scalarf8 const & b) {
    return simd::convert<float>(a.v < b.v);
}

static inline Scalarf8 operator <= (Scalarf8 const & a, Scalarf8 const & b) {
    return simd::convert<float>(a.v <= b.v);
}

static inline Scalarf8 operator > (Scalarf8 const & a, Scalarf8 const & b) {
    return simd::convert<float>(a.v > b.v);
}

static inline Scalarf8 operator >= (Scalarf8 const & a, Scalarf8 const & b) {
    return simd::convert<float>(a.v >= b.v);
}

static inline Scalarf8 min(Scalarf8 const & a, Scalarf8 const & b) {
    return simd::min(a.v, b.v);
}

static inline Scalarf8 max(Scalarf8 const & a, Scalarf8 const & b) {
    return simd::max(a.v, b.v);
}

static inline Scalarf8 abs(Scalarf8 const & a) {
    return simd::abs(a.v);
}

//does the same as for (int i = 0; i < 8; i++) result[i] = c[i] ? a[i] : b[i];
//the elemets in c must be either 0 (false) or 0xFFFFFFFF (true)
static inline Scalarf8 blend(Scalarf8 const & c, Scalarf8 const & a, Scalarf8 const & b) {
    return simd::select(b.v, a.v, simd::convert<int>(c.v));
}

static inline Scalarf8 convert_zero(const unsigned int *idx, const Real *x, const unsigned char count = 8u)
{
    Scalarf8 v;
    switch (count)
    {
    case 1u:
        v.v = { x[idx[0]], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }; break;
    case 2u:
        v.v = { x[idx[0]], x[idx[1]], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }; break;
    case 3u:
        v.v = { x[idx[0]], x[idx[1]], x[idx[2]], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }; break;
    case 4u:
        v.v = { x[idx[0]], x[idx[1]], x[idx[2]], x[idx[3]], 0.0f, 0.0f, 0.0f, 0.0f }; break;
    case 5u:
        v.v = { x[idx[0]], x[idx[1]], x[idx[2]], x[idx[3]], x[idx[4]], 0.0f, 0.0f, 0.0f }; break;
    case 6u:
        v.v = { x[idx[0]], x[idx[1]], x[idx[2]], x[idx[3]], x[idx[4]], x[idx[5]], 0.0f, 0.0f }; break;
    case 7u:
        v.v = { x[idx[0]], x[idx[1]], x[idx[2]], x[idx[3]], x[idx[4]], x[idx[5]], x[idx[6]], 0.0f }; break;
    case 8u:
        v.v = { x[idx[0]], x[idx[1]], x[idx[2]], x[idx[3]], x[idx[4]], x[idx[5]], x[idx[6]], x[idx[7]] }; break;
    }
    return v;
}

static inline Scalarf8 convert_zero(const Real x, const unsigned char count = 8u)
{
    Scalarf8 v;
    switch (count)
    {
    case 1u:
        v.v = { x, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }; break;
    case 2u:
        v.v = { x, x, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }; break;
    case 3u:
        v.v = { x, x, x, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }; break;
    case 4u:
        v.v = { x, x, x, x, 0.0f, 0.0f, 0.0f, 0.0f }; break;
    case 5u:
        v.v = { x, x, x, x, x, 0.0f, 0.0f, 0.0f }; break;
    case 6u:
        v.v = { x, x, x, x, x, x, 0.0f, 0.0f }; break;
    case 7u:
        v.v = { x, x, x, x, x, x, x, 0.0f }; break;
    case 8u:
        v.v = { x, x, x, x, x, x, x, x }; break;
    }
    return v;
}

static inline Scalarf8 convert_one(const unsigned int *idx, const Real *x, const unsigned char count = 8u)
{
    Scalarf8 v;
    switch (count)
    {
    case 1u:
        v.v = { x[idx[0]], 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }; break;
    case 2u:
        v.v = { x[idx[0]], x[idx[1]], 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }; break;
    case 3u:
        v.v = { x[idx[0]], x[idx[1]], x[idx[2]], 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }; break;
    case 4u:
        v.v = { x[idx[0]], x[idx[1]], x[idx[2]], x[idx[3]], 1.0f, 1.0f, 1.0f, 1.0f }; break;
    case 5u:
        v.v = { x[idx[0]], x[idx[1]], x[idx[2]], x[idx[3]], x[idx[4]], 1.0f, 1.0f, 1.0f }; break;
    case 6u:
        v.v = { x[idx[0]], x[idx[1]], x[idx[2]], x[idx[3]], x[idx[4]], x[idx[5]], 1.0f, 1.0f }; break;
    case 7u:
        v.v = { x[idx[0]], x[idx[1]], x[idx[2]], x[idx[3]], x[idx[4]], x[idx[5]], x[idx[6]], 1.0f }; break;
    case 8u:
        v.v = { x[idx[0]], x[idx[1]], x[idx[2]], x[idx[3]], x[idx[4]], x[idx[5]], x[idx[6]], x[idx[7]] }; break;
    }
    return v;
}


// ----------------------------------------------------------------------------------------------
//3 dimensional vector of Scalar8f to represent 8 3d vectors
class Vector3f8
{
public:

    Scalarf8 v[3];

    Vector3f8() {}
    Vector3f8(const bool ) { v[0] = Scalarf8(0.0f); v[1] = Scalarf8(0.0f); v[2] = Scalarf8(0.0f); }
    Vector3f8(const Scalarf8 &x, const Scalarf8 &y, const Scalarf8 &z) { v[0] = x; v[1] = y; v[2] = z; }
    Vector3f8(const Scalarf8 &x) { v[0] = v[1] = v[2] = x; }
    Vector3f8(const Vector3f &x)
    {
        v[0].v = x[0];
        v[1].v = x[1];
        v[2].v = x[2];
    }
    Vector3f8(const Vector3f &v0, const Vector3f &v1, const Vector3f &v2, const Vector3f &v3, const Vector3f &v4, const Vector3f &v5, const Vector3f &v6, const Vector3f &v7)
    {
        Scalarf8 x(v0[0], v1[0], v2[0], v3[0], v4[0], v5[0], v6[0], v7[0]);
        Scalarf8 y(v0[1], v1[1], v2[1], v3[1], v4[1], v5[1], v6[1], v7[1]);
        Scalarf8 z(v0[2], v1[2], v2[2], v3[2], v4[2], v5[2], v6[2], v7[2]);
        v[0] = x; v[1] = y; v[2] = z;
    }
    Vector3f8(Vector3f const *x)
    {
        Scalarf8 vx(x[0][0], x[1][0], x[2][0], x[3][0], x[4][0], x[5][0], x[6][0], x[7][0]);
        Scalarf8 vy(x[0][1], x[1][1], x[2][1], x[3][1], x[4][1], x[5][1], x[6][1], x[7][1]);
        Scalarf8 vz(x[0][2], x[1][2], x[2][2], x[3][2], x[4][2], x[5][2], x[6][2], x[7][2]);
        v[0] = vx; v[1] = vy; v[2] = vz;
    }

    inline void setZero()
    {
        v[0].v = 0.0f; v[1].v = 0.0f; v[2].v = 0.0f;
    }

    inline Scalarf8& operator [] (int i) { return v[i]; }
    inline const Scalarf8& operator [] (int i) const { return v[i]; }

    inline Scalarf8& x() { return v[0]; }
    inline Scalarf8& y() { return v[1]; }
    inline Scalarf8& z() { return v[2]; }

    inline const Scalarf8& x() const { return v[0]; }
    inline const Scalarf8& y() const { return v[1]; }
    inline const Scalarf8& z() const { return v[2]; }

    inline Scalarf8 dot(const Vector3f8& a) const {
        Scalarf8 res;
        res.v = simd::fma(v[0].v, a.v[0].v, simd::fma(v[1].v, a.v[1].v, v[2].v * a.v[2].v));
        return res;
    }

    //dot product
    inline Scalarf8 operator * (const Vector3f8& a) const {
        Scalarf8 res;
        res.v = simd::fma(v[0].v, a.v[0].v, simd::fma(v[1].v, a.v[1].v, v[2].v * a.v[2].v));
        return res;
    }

    inline void cross(const Vector3f8& a, const Vector3f8& b) {
        v[0] = a.v[1] * b.v[2] - a.v[2] * b.v[1];
        v[1] = a.v[2] * b.v[0] - a.v[0] * b.v[2];
        v[2] = a.v[0] * b.v[1] - a.v[1] * b.v[0];
    }

    //cross product
    inline const Vector3f8 operator % (const Vector3f8& a) const {
        return Vector3f8(v[1] * a.v[2] - v[2] * a.v[1],
            v[2] * a.v[0] - v[0] * a.v[2],
            v[0] * a.v[1] - v[1] * a.v[0]);
    }

    inline Vector3f8& operator *= (const Scalarf8 &s) {
        v[0] *= s;
        v[1] *= s;
        v[2] *= s;
        return *this;
    }

    inline const Vector3f8 operator / (const Scalarf8 &s) const {
        return Vector3f8(v[0] / s, v[1] / s, v[2] / s);
    }

    inline Vector3f8& operator /= (const Scalarf8 &s) {
        v[0] = v[0] / s;
        v[1] = v[1] / s;
        v[2] = v[2] / s;
        return *this;
    }

    inline const Vector3f8 operator - () const {
        return Vector3f8(Scalarf8(-1.0) * v[0], Scalarf8(-1.0) * v[1], Scalarf8(-1.0) * v[2]);
    }

    inline Scalarf8 squaredNorm() const {
        return simd::fma(v[0].v, v[0].v, simd::fma(v[1].v, v[1].v, v[2].v * v[2].v));
    }

    inline Scalarf8 norm() const
    {
        return simd::sqrt(simd::fma(v[0].v, v[0].v, simd::fma(v[1].v, v[1].v, v[2].v * v[2].v)));
    }

    inline void normalize()
    {
        const Scalarf8 norm = (v[0] * v[0] + v[1] * v[1] + v[2] * v[2]).sqrt();
        v[0] = v[0] / norm;
        v[1] = v[1] / norm;
        v[2] = v[2] / norm;
        *this = blend(norm >= Scalarf8(1e-5f), *this, Vector3f8(true));
    }

    //does the same as for (int i = 0; i < 8; i++) result[i] = c[i] ? a[i] : b[i];
    //the elements in c must be either 0 (false) or 0xFFFFFFFF (true)
    static inline Vector3f8 blend(Scalarf8 const & c, Vector3f8 const & a, Vector3f8 const & b) {
        Vector3f8 result;
        result.x() = simd::select(b.x().v, a.x().v, simd::convert<int>(c.v));
        result.y() = simd::select(b.y().v, a.y().v, simd::convert<int>(c.v));
        result.z() = simd::select(b.z().v, a.z().v, simd::convert<int>(c.v));
        return result;
    }

    inline void store(std::vector<Vector3r>& Vf) const
    {
        for (int i = 0; i < 3; i++)
        {
            float val[8];
            v[i].store(val);
            for (int k = 0; k < 8; k++)
                Vf[k][i] = val[k];
        }
    }

    inline void store(Vector3r* Vf) const
    {
        for (int i = 0; i < 3; i++)
        {
            float val[8];
            v[i].store(val);
            for (int k = 0; k < 8; k++)
                Vf[k][i] = val[k];
        }
    }

    inline Vector3r reduce() const {
        Vector3r res;
        res[0] = v[0].reduce();
        res[1] = v[1].reduce();
        res[2] = v[2].reduce();
        return res;
    }
};



inline Vector3f8 operator + (Vector3f8 const& a, Vector3f8 const& b) {
    Vector3f8 res;
    res.v[0].v = a[0].v + b[0].v;
    res.v[1].v = a[1].v + b[1].v;
    res.v[2].v = a[2].v + b[2].v;
    return res;
}

inline Vector3f8 operator - (Vector3f8 const& a, Vector3f8 const& b) {
    Vector3f8 res;
    res.v[0].v = a[0].v - b[0].v;
    res.v[1].v = a[1].v - b[1].v;
    res.v[2].v = a[2].v - b[2].v;
    return res;
}

inline Vector3f8& operator += (Vector3f8& a, Vector3f8 const& b) {
    a[0].v += b[0].v;
    a[1].v += b[1].v;
    a[2].v += b[2].v;
    return a;
}

inline Vector3f8& operator -= (Vector3f8& a, Vector3f8 const& b) {
    a[0].v -= b[0].v;
    a[1].v -= b[1].v;
    a[2].v -= b[2].v;
    return a;
}

inline Vector3f8 operator * (Vector3f8 const& a, const Scalarf8& s) {
    Vector3f8 res;
    res.v[0].v = a[0].v * s.v;
    res.v[1].v = a[1].v * s.v;
    res.v[2].v = a[2].v * s.v;
    return res;
}

inline Vector3f8 convertVec_zero(const unsigned int* idx, const Real* v, const unsigned char count = 8u)
{
    Vector3f8 x;
    switch (count)
    {
    case 1u:
        x.v[0].v = { v[3*idx[0]+0], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.v[1].v = { v[3*idx[0]+1], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.v[2].v = { v[3*idx[0]+2], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        break;
    case 2u:
        x.v[0].v = { v[3*idx[0]+0], v[3*idx[1]+0], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.v[1].v = { v[3*idx[0]+1], v[3*idx[1]+1], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.v[2].v = { v[3*idx[0]+2], v[3*idx[1]+2], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        break;
    case 3u:
        x.v[0].v = { v[3*idx[0]+0], v[3*idx[1]+0], v[3*idx[2]+0], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.v[1].v = { v[3*idx[0]+1], v[3*idx[1]+1], v[3*idx[2]+1], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.v[2].v = { v[3*idx[0]+2], v[3*idx[1]+2], v[3*idx[2]+2], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        break;
    case 4u:
        x.v[0].v = { v[3*idx[0]+0], v[3*idx[1]+0], v[3*idx[2]+0], v[3*idx[3]+0], 0.0f, 0.0f, 0.0f, 0.0f };
        x.v[1].v = { v[3*idx[0]+1], v[3*idx[1]+1], v[3*idx[2]+1], v[3*idx[3]+1], 0.0f, 0.0f, 0.0f, 0.0f };
        x.v[2].v = { v[3*idx[0]+2], v[3*idx[1]+2], v[3*idx[2]+2], v[3*idx[3]+2], 0.0f, 0.0f, 0.0f, 0.0f };
        break;
    case 5u:
        x.v[0].v = { v[3*idx[0]+0], v[3*idx[1]+0], v[3*idx[2]+0], v[3*idx[3]+0], v[3*idx[4]+0], 0.0f, 0.0f, 0.0f };
        x.v[1].v = { v[3*idx[0]+1], v[3*idx[1]+1], v[3*idx[2]+1], v[3*idx[3]+1], v[3*idx[4]+1], 0.0f, 0.0f, 0.0f };
        x.v[2].v = { v[3*idx[0]+2], v[3*idx[1]+2], v[3*idx[2]+2], v[3*idx[3]+2], v[3*idx[4]+2], 0.0f, 0.0f, 0.0f };
        break;
    case 6u:
        x.v[0].v = { v[3*idx[0]+0], v[3*idx[1]+0], v[3*idx[2]+0], v[3*idx[3]+0], v[3*idx[4]+0], v[3*idx[5]+0], 0.0f, 0.0f };
        x.v[1].v = { v[3*idx[0]+1], v[3*idx[1]+1], v[3*idx[2]+1], v[3*idx[3]+1], v[3*idx[4]+1], v[3*idx[5]+1], 0.0f, 0.0f };
        x.v[2].v = { v[3*idx[0]+2], v[3*idx[1]+2], v[3*idx[2]+2], v[3*idx[3]+2], v[3*idx[4]+2], v[3*idx[5]+2], 0.0f, 0.0f };
        break;
    case 7u:
        x.v[0].v = { v[3*idx[0]+0], v[3*idx[1]+0], v[3*idx[2]+0], v[3*idx[3]+0], v[3*idx[4]+0], v[3*idx[5]+0], v[3*idx[6]+0], 0.0f };
        x.v[1].v = { v[3*idx[0]+1], v[3*idx[1]+1], v[3*idx[2]+1], v[3*idx[3]+1], v[3*idx[4]+1], v[3*idx[5]+1], v[3*idx[6]+1], 0.0f };
        x.v[2].v = { v[3*idx[0]+2], v[3*idx[1]+2], v[3*idx[2]+2], v[3*idx[3]+2], v[3*idx[4]+2], v[3*idx[5]+2], v[3*idx[6]+2], 0.0f };
        break;
    case 8u:
        x.v[0].v = { v[3*idx[0]+0], v[3*idx[1]+0], v[3*idx[2]+0], v[3*idx[3]+0], v[3*idx[4]+0], v[3*idx[5]+0], v[3*idx[6]+0], v[3*idx[7]+0] };
        x.v[1].v = { v[3*idx[0]+1], v[3*idx[1]+1], v[3*idx[2]+1], v[3*idx[3]+1], v[3*idx[4]+1], v[3*idx[5]+1], v[3*idx[6]+1], v[3*idx[7]+1] };
        x.v[2].v = { v[3*idx[0]+2], v[3*idx[1]+2], v[3*idx[2]+2], v[3*idx[3]+2], v[3*idx[4]+2], v[3*idx[5]+2], v[3*idx[6]+2], v[3*idx[7]+2] };
    }
    return x;
}

static inline Vector3f8 convertVec_zero(const unsigned int *idx, const Vector3r *v, const unsigned char count = 8u)
{
    Vector3f8 x;
    switch (count)
    {
    case 1u:
        x.v[0].v = { v[idx[0]][0], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.v[1].v = { v[idx[0]][1], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.v[2].v = { v[idx[0]][2], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        break;
    case 2u:
        x.v[0].v = { v[idx[0]][0], v[idx[1]][0], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.v[1].v = { v[idx[0]][1], v[idx[1]][1], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.v[2].v = { v[idx[0]][2], v[idx[1]][2], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        break;
    case 3u:
        x.v[0].v = { v[idx[0]][0], v[idx[1]][0], v[idx[2]][0], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.v[1].v = { v[idx[0]][1], v[idx[1]][1], v[idx[2]][1], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.v[2].v = { v[idx[0]][2], v[idx[1]][2], v[idx[2]][2], 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        break;
    case 4u:
        x.v[0].v = { v[idx[0]][0], v[idx[1]][0], v[idx[2]][0], v[idx[3]][0], 0.0f, 0.0f, 0.0f, 0.0f };
        x.v[1].v = { v[idx[0]][1], v[idx[1]][1], v[idx[2]][1], v[idx[3]][1], 0.0f, 0.0f, 0.0f, 0.0f };
        x.v[2].v = { v[idx[0]][2], v[idx[1]][2], v[idx[2]][2], v[idx[3]][2], 0.0f, 0.0f, 0.0f, 0.0f };
        break;
    case 5u:
        x.v[0].v = { v[idx[0]][0], v[idx[1]][0], v[idx[2]][0], v[idx[3]][0], v[idx[4]][0], 0.0f, 0.0f, 0.0f };
        x.v[1].v = { v[idx[0]][1], v[idx[1]][1], v[idx[2]][1], v[idx[3]][1], v[idx[4]][1], 0.0f, 0.0f, 0.0f };
        x.v[2].v = { v[idx[0]][2], v[idx[1]][2], v[idx[2]][2], v[idx[3]][2], v[idx[4]][2], 0.0f, 0.0f, 0.0f };
        break;
    case 6u:
        x.v[0].v = { v[idx[0]][0], v[idx[1]][0], v[idx[2]][0], v[idx[3]][0], v[idx[4]][0], v[idx[5]][0], 0.0f, 0.0f };
        x.v[1].v = { v[idx[0]][1], v[idx[1]][1], v[idx[2]][1], v[idx[3]][1], v[idx[4]][1], v[idx[5]][1], 0.0f, 0.0f };
        x.v[2].v = { v[idx[0]][2], v[idx[1]][2], v[idx[2]][2], v[idx[3]][2], v[idx[4]][2], v[idx[5]][2], 0.0f, 0.0f };
        break;
    case 7u:
        x.v[0].v = { v[idx[0]][0], v[idx[1]][0], v[idx[2]][0], v[idx[3]][0], v[idx[4]][0], v[idx[5]][0], v[idx[6]][0], 0.0f };
        x.v[1].v = { v[idx[0]][1], v[idx[1]][1], v[idx[2]][1], v[idx[3]][1], v[idx[4]][1], v[idx[5]][1], v[idx[6]][1], 0.0f };
        x.v[2].v = { v[idx[0]][2], v[idx[1]][2], v[idx[2]][2], v[idx[3]][2], v[idx[4]][2], v[idx[5]][2], v[idx[6]][2], 0.0f };
        break;
    case 8u:
        x.v[0].v = { v[idx[0]][0], v[idx[1]][0], v[idx[2]][0], v[idx[3]][0], v[idx[4]][0], v[idx[5]][0], v[idx[6]][0], v[idx[7]][0] };
        x.v[1].v = { v[idx[0]][1], v[idx[1]][1], v[idx[2]][1], v[idx[3]][1], v[idx[4]][1], v[idx[5]][1], v[idx[6]][1], v[idx[7]][1] };
        x.v[2].v = { v[idx[0]][2], v[idx[1]][2], v[idx[2]][2], v[idx[3]][2], v[idx[4]][2], v[idx[5]][2], v[idx[6]][2], v[idx[7]][2] };
    }
    return x;
}

// ----------------------------------------------------------------------------------------------
//3x3 dimensional matrix of Scalar8f to represent 8 3x3 matrices
class Matrix3f8
{
public:
    Scalarf8 m[3][3];

    Matrix3f8() {  }

    Matrix3f8(const Matrix3f &x)
    {
        const Vector3f8 m1(x.col(0));
        const Vector3f8 m2(x.col(1));
        const Vector3f8 m3(x.col(2));
        m[0][0] = m1.x();
        m[1][0] = m1.y();
        m[2][0] = m1.z();

        m[0][1] = m2.x();
        m[1][1] = m2.y();
        m[2][1] = m2.z();

        m[0][2] = m3.x();
        m[1][2] = m3.y();
        m[2][2] = m3.z();
    }

    //constructor to create matrix from 3 column vectors
    Matrix3f8(const Vector3f8& m1, const Vector3f8& m2, const Vector3f8& m3)
    {
        m[0][0] = m1.x();
        m[1][0] = m1.y();
        m[2][0] = m1.z();

        m[0][1] = m2.x();
        m[1][1] = m2.y();
        m[2][1] = m2.z();

        m[0][2] = m3.x();
        m[1][2] = m3.y();
        m[2][2] = m3.z();
    }

    inline void setZero()
    {
        m[0][0].v = 0.0f;
        m[1][0].v = 0.0f;
        m[2][0].v = 0.0f;

        m[0][1].v = 0.0f;
        m[1][1].v = 0.0f;
        m[2][1].v = 0.0f;

        m[0][2].v = 0.0f;
        m[1][2].v = 0.0f;
        m[2][2].v = 0.0f;
    }

    inline Scalarf8& operator()(int i, int j) { return m[i][j]; }

    inline void setCol(int i, const Vector3f8& v)
    {
        m[0][i] = v.x();
        m[1][i] = v.y();
        m[2][i] = v.z();
    }

    inline void setCol(int i, const Scalarf8& x, const Scalarf8& y, const Scalarf8& z)
    {
        m[0][i] = x;
        m[1][i] = y;
        m[2][i] = z;
    }

    inline Matrix3f8 operator * (const Scalarf8 &b) const
    {
        Matrix3f8 A;

        A.m[0][0] = m[0][0] * b;
        A.m[0][1] = m[0][1] * b;
        A.m[0][2] = m[0][2] * b;

        A.m[1][0] = m[1][0] * b;
        A.m[1][1] = m[1][1] * b;
        A.m[1][2] = m[1][2] * b;

        A.m[2][0] = m[2][0] * b;
        A.m[2][1] = m[2][1] * b;
        A.m[2][2] = m[2][2] * b;

        return A;
    }

    inline Vector3f8 operator * (const Vector3f8 &b) const
    {
        Vector3f8 A;

        A.v[0] = m[0][0] * b.v[0] + m[0][1] * b.v[1] + m[0][2] * b.v[2];
        A.v[1] = m[1][0] * b.v[0] + m[1][1] * b.v[1] + m[1][2] * b.v[2];
        A.v[2] = m[2][0] * b.v[0] + m[2][1] * b.v[1] + m[2][2] * b.v[2];

        return A;
    }

    inline Matrix3f8 operator * (const Matrix3f8 &b) const
    {
        Matrix3f8 A;

        A.m[0][0] = m[0][0] * b.m[0][0] + m[0][1] * b.m[1][0] + m[0][2] * b.m[2][0];
        A.m[0][1] = m[0][0] * b.m[0][1] + m[0][1] * b.m[1][1] + m[0][2] * b.m[2][1];
        A.m[0][2] = m[0][0] * b.m[0][2] + m[0][1] * b.m[1][2] + m[0][2] * b.m[2][2];

        A.m[1][0] = m[1][0] * b.m[0][0] + m[1][1] * b.m[1][0] + m[1][2] * b.m[2][0];
        A.m[1][1] = m[1][0] * b.m[0][1] + m[1][1] * b.m[1][1] + m[1][2] * b.m[2][1];
        A.m[1][2] = m[1][0] * b.m[0][2] + m[1][1] * b.m[1][2] + m[1][2] * b.m[2][2];

        A.m[2][0] = m[2][0] * b.m[0][0] + m[2][1] * b.m[1][0] + m[2][2] * b.m[2][0];
        A.m[2][1] = m[2][0] * b.m[0][1] + m[2][1] * b.m[1][1] + m[2][2] * b.m[2][1];
        A.m[2][2] = m[2][0] * b.m[0][2] + m[2][1] * b.m[1][2] + m[2][2] * b.m[2][2];

        return A;
    }

    inline Matrix3f8& operator += (const Matrix3f8& a) {
        m[0][0] += a.m[0][0];
        m[1][0] += a.m[1][0];
        m[2][0] += a.m[2][0];

        m[0][1] += a.m[0][1];
        m[1][1] += a.m[1][1];
        m[2][1] += a.m[2][1];

        m[0][2] += a.m[0][2];
        m[1][2] += a.m[1][2];
        m[2][2] += a.m[2][2];
        return *this;
    }

    inline Matrix3f8 transpose() const
    {
        Matrix3f8 A;
        A.m[0][0] = m[0][0]; A.m[0][1] = m[1][0]; A.m[0][2] = m[2][0];
        A.m[1][0] = m[0][1]; A.m[1][1] = m[1][1]; A.m[1][2] = m[2][1];
        A.m[2][0] = m[0][2]; A.m[2][1] = m[1][2]; A.m[2][2] = m[2][2];

        return A;
    }

    inline Scalarf8 determinant() const
    {
        return  m[0][1] * m[1][2] * m[2][0] - m[0][2] * m[1][1] * m[2][0] + m[0][2] * m[1][0] * m[2][1]
              - m[0][0] * m[1][2] * m[2][1] - m[0][1] * m[1][0] * m[2][2] + m[0][0] * m[1][1] * m[2][2];
    }

    inline void store(std::vector<Matrix3r>& Mf) const
    {
        for (int i = 0; i < 3; i++)
        {
            for (int j = 0; j < 3; j++)
            {
                float val[8];
                m[i][j].store(val);
                for (int k = 0; k < 8; k++)
                    Mf[k](i, j) = val[k];
            }
        }
    }

    inline Matrix3r reduce() const {
        Matrix3r A;
        A(0, 0) = m[0][0].reduce();
        A(0, 1) = m[0][1].reduce();
        A(0, 2) = m[0][2].reduce();

        A(1, 0) = m[1][0].reduce();
        A(1, 1) = m[1][1].reduce();
        A(1, 2) = m[1][2].reduce();

        A(2, 0) = m[2][0].reduce();
        A(2, 1) = m[2][1].reduce();
        A(2, 2) = m[2][2].reduce();
        return A;
    }
};



inline Matrix3f8& operator -= (Matrix3f8& a, Matrix3f8 const& b) {
    a.m[0][0].v -= b.m[0][0].v;
    a.m[0][1].v -= b.m[0][1].v;
    a.m[0][2].v -= b.m[0][2].v;

    a.m[1][0].v -= b.m[1][0].v;
    a.m[1][1].v -= b.m[1][1].v;
    a.m[1][2].v -= b.m[1][2].v;

    a.m[2][0].v -= b.m[2][0].v;
    a.m[2][1].v -= b.m[2][1].v;
    a.m[2][2].v -= b.m[2][2].v;

    return a;
}

static inline Matrix3f8 convertMat_zero(const unsigned int* idx, const Matrix3r* v, const unsigned char count = 8u)
{
    Matrix3f8 x;
    switch (count)
    {
    case 1u:
        x.m[0][0] =  { v[idx[0]](0, 0), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[0][1] =  { v[idx[0]](0, 1), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[0][2] =  { v[idx[0]](0, 2), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[1][0] =  { v[idx[0]](1, 0), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[1][1] =  { v[idx[0]](1, 1), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[1][2] =  { v[idx[0]](1, 2), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[2][0] =  { v[idx[0]](2, 0), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[2][1] =  { v[idx[0]](2, 1), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[2][2] =  { v[idx[0]](2, 2), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        break;
    case 2u:
        x.m[0][0] =  { v[idx[0]](0, 0), v[idx[1]](0, 0), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[0][1] =  { v[idx[0]](0, 1), v[idx[1]](0, 1), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[0][2] =  { v[idx[0]](0, 2), v[idx[1]](0, 2), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[1][0] =  { v[idx[0]](1, 0), v[idx[1]](1, 0), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[1][1] =  { v[idx[0]](1, 1), v[idx[1]](1, 1), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[1][2] =  { v[idx[0]](1, 2), v[idx[1]](1, 2), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[2][0] =  { v[idx[0]](2, 0), v[idx[1]](2, 0), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[2][1] =  { v[idx[0]](2, 1), v[idx[1]](2, 1), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[2][2] =  { v[idx[0]](2, 2), v[idx[1]](2, 2), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        break;
    case 3u:
        x.m[0][0] =  { v[idx[0]](0, 0), v[idx[1]](0, 0), v[idx[2]](0, 0), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[0][1] =  { v[idx[0]](0, 1), v[idx[1]](0, 1), v[idx[2]](0, 1), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[0][2] =  { v[idx[0]](0, 2), v[idx[1]](0, 2), v[idx[2]](0, 2), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[1][0] =  { v[idx[0]](1, 0), v[idx[1]](1, 0), v[idx[2]](1, 0), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[1][1] =  { v[idx[0]](1, 1), v[idx[1]](1, 1), v[idx[2]](1, 1), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[1][2] =  { v[idx[0]](1, 2), v[idx[1]](1, 2), v[idx[2]](1, 2), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[2][0] =  { v[idx[0]](2, 0), v[idx[1]](2, 0), v[idx[2]](2, 0), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[2][1] =  { v[idx[0]](2, 1), v[idx[1]](2, 1), v[idx[2]](2, 1), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[2][2] =  { v[idx[0]](2, 2), v[idx[1]](2, 2), v[idx[2]](2, 2), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
        break;
    case 4u:
        x.m[0][0] =  { v[idx[0]](0, 0), v[idx[1]](0, 0), v[idx[2]](0, 0), v[idx[3]](0, 0), 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[0][1] =  { v[idx[0]](0, 1), v[idx[1]](0, 1), v[idx[2]](0, 1), v[idx[3]](0, 1), 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[0][2] =  { v[idx[0]](0, 2), v[idx[1]](0, 2), v[idx[2]](0, 2), v[idx[3]](0, 2), 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[1][0] =  { v[idx[0]](1, 0), v[idx[1]](1, 0), v[idx[2]](1, 0), v[idx[3]](1, 0), 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[1][1] =  { v[idx[0]](1, 1), v[idx[1]](1, 1), v[idx[2]](1, 1), v[idx[3]](1, 1), 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[1][2] =  { v[idx[0]](1, 2), v[idx[1]](1, 2), v[idx[2]](1, 2), v[idx[3]](1, 2), 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[2][0] =  { v[idx[0]](2, 0), v[idx[1]](2, 0), v[idx[2]](2, 0), v[idx[3]](2, 0), 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[2][1] =  { v[idx[0]](2, 1), v[idx[1]](2, 1), v[idx[2]](2, 1), v[idx[3]](2, 1), 0.0f, 0.0f, 0.0f, 0.0f };
        x.m[2][2] =  { v[idx[0]](2, 2), v[idx[1]](2, 2), v[idx[2]](2, 2), v[idx[3]](2, 2), 0.0f, 0.0f, 0.0f, 0.0f };
        break;
    case 5u:
        x.m[0][0] =  { v[idx[0]](0, 0), v[idx[1]](0, 0), v[idx[2]](0, 0), v[idx[3]](0, 0), v[idx[4]](0, 0), 0.0f, 0.0f, 0.0f };
        x.m[0][1] =  { v[idx[0]](0, 1), v[idx[1]](0, 1), v[idx[2]](0, 1), v[idx[3]](0, 1), v[idx[4]](0, 1), 0.0f, 0.0f, 0.0f };
        x.m[0][2] =  { v[idx[0]](0, 2), v[idx[1]](0, 2), v[idx[2]](0, 2), v[idx[3]](0, 2), v[idx[4]](0, 2), 0.0f, 0.0f, 0.0f };
        x.m[1][0] =  { v[idx[0]](1, 0), v[idx[1]](1, 0), v[idx[2]](1, 0), v[idx[3]](1, 0), v[idx[4]](1, 0), 0.0f, 0.0f, 0.0f };
        x.m[1][1] =  { v[idx[0]](1, 1), v[idx[1]](1, 1), v[idx[2]](1, 1), v[idx[3]](1, 1), v[idx[4]](1, 1), 0.0f, 0.0f, 0.0f };
        x.m[1][2] =  { v[idx[0]](1, 2), v[idx[1]](1, 2), v[idx[2]](1, 2), v[idx[3]](1, 2), v[idx[4]](1, 2), 0.0f, 0.0f, 0.0f };
        x.m[2][0] =  { v[idx[0]](2, 0), v[idx[1]](2, 0), v[idx[2]](2, 0), v[idx[3]](2, 0), v[idx[4]](2, 0), 0.0f, 0.0f, 0.0f };
        x.m[2][1] =  { v[idx[0]](2, 1), v[idx[1]](2, 1), v[idx[2]](2, 1), v[idx[3]](2, 1), v[idx[4]](2, 1), 0.0f, 0.0f, 0.0f };
        x.m[2][2] =  { v[idx[0]](2, 2), v[idx[1]](2, 2), v[idx[2]](2, 2), v[idx[3]](2, 2), v[idx[4]](2, 2), 0.0f, 0.0f, 0.0f };
        break;
    case 6u:
        x.m[0][0] =  { v[idx[0]](0, 0), v[idx[1]](0, 0), v[idx[2]](0, 0), v[idx[3]](0, 0), v[idx[4]](0, 0), v[idx[5]](0, 0), 0.0f, 0.0f };
        x.m[0][1] =  { v[idx[0]](0, 1), v[idx[1]](0, 1), v[idx[2]](0, 1), v[idx[3]](0, 1), v[idx[4]](0, 1), v[idx[5]](0, 1), 0.0f, 0.0f };
        x.m[0][2] =  { v[idx[0]](0, 2), v[idx[1]](0, 2), v[idx[2]](0, 2), v[idx[3]](0, 2), v[idx[4]](0, 2), v[idx[5]](0, 2), 0.0f, 0.0f };
        x.m[1][0] =  { v[idx[0]](1, 0), v[idx[1]](1, 0), v[idx[2]](1, 0), v[idx[3]](1, 0), v[idx[4]](1, 0), v[idx[5]](1, 0), 0.0f, 0.0f };
        x.m[1][1] =  { v[idx[0]](1, 1), v[idx[1]](1, 1), v[idx[2]](1, 1), v[idx[3]](1, 1), v[idx[4]](1, 1), v[idx[5]](1, 1), 0.0f, 0.0f };
        x.m[1][2] =  { v[idx[0]](1, 2), v[idx[1]](1, 2), v[idx[2]](1, 2), v[idx[3]](1, 2), v[idx[4]](1, 2), v[idx[5]](1, 2), 0.0f, 0.0f };
        x.m[2][0] =  { v[idx[0]](2, 0), v[idx[1]](2, 0), v[idx[2]](2, 0), v[idx[3]](2, 0), v[idx[4]](2, 0), v[idx[5]](2, 0), 0.0f, 0.0f };
        x.m[2][1] =  { v[idx[0]](2, 1), v[idx[1]](2, 1), v[idx[2]](2, 1), v[idx[3]](2, 1), v[idx[4]](2, 1), v[idx[5]](2, 1), 0.0f, 0.0f };
        x.m[2][2] =  { v[idx[0]](2, 2), v[idx[1]](2, 2), v[idx[2]](2, 2), v[idx[3]](2, 2), v[idx[4]](2, 2), v[idx[5]](2, 2), 0.0f, 0.0f };
        break;
    case 7u:
        x.m[0][0] =  { v[idx[0]](0, 0), v[idx[1]](0, 0), v[idx[2]](0, 0), v[idx[3]](0, 0), v[idx[4]](0, 0), v[idx[5]](0, 0), v[idx[6]](0, 0), 0.0f };
        x.m[0][1] =  { v[idx[0]](0, 1), v[idx[1]](0, 1), v[idx[2]](0, 1), v[idx[3]](0, 1), v[idx[4]](0, 1), v[idx[5]](0, 1), v[idx[6]](0, 1), 0.0f };
        x.m[0][2] =  { v[idx[0]](0, 2), v[idx[1]](0, 2), v[idx[2]](0, 2), v[idx[3]](0, 2), v[idx[4]](0, 2), v[idx[5]](0, 2), v[idx[6]](0, 2), 0.0f };
        x.m[1][0] =  { v[idx[0]](1, 0), v[idx[1]](1, 0), v[idx[2]](1, 0), v[idx[3]](1, 0), v[idx[4]](1, 0), v[idx[5]](1, 0), v[idx[6]](1, 0), 0.0f };
        x.m[1][1] =  { v[idx[0]](1, 1), v[idx[1]](1, 1), v[idx[2]](1, 1), v[idx[3]](1, 1), v[idx[4]](1, 1), v[idx[5]](1, 1), v[idx[6]](1, 1), 0.0f };
        x.m[1][2] =  { v[idx[0]](1, 2), v[idx[1]](1, 2), v[idx[2]](1, 2), v[idx[3]](1, 2), v[idx[4]](1, 2), v[idx[5]](1, 2), v[idx[6]](1, 2), 0.0f };
        x.m[2][0] =  { v[idx[0]](2, 0), v[idx[1]](2, 0), v[idx[2]](2, 0), v[idx[3]](2, 0), v[idx[4]](2, 0), v[idx[5]](2, 0), v[idx[6]](2, 0), 0.0f };
        x.m[2][1] =  { v[idx[0]](2, 1), v[idx[1]](2, 1), v[idx[2]](2, 1), v[idx[3]](2, 1), v[idx[4]](2, 1), v[idx[5]](2, 1), v[idx[6]](2, 1), 0.0f };
        x.m[2][2] =  { v[idx[0]](2, 2), v[idx[1]](2, 2), v[idx[2]](2, 2), v[idx[3]](2, 2), v[idx[4]](2, 2), v[idx[5]](2, 2), v[idx[6]](2, 2), 0.0f };
        break;
    case 8u:
        x.m[0][0] =  { v[idx[0]](0, 0), v[idx[1]](0, 0), v[idx[2]](0, 0), v[idx[3]](0, 0), v[idx[4]](0, 0), v[idx[5]](0, 0), v[idx[6]](0, 0), v[idx[7]](0, 0) };
        x.m[0][1] =  { v[idx[0]](0, 1), v[idx[1]](0, 1), v[idx[2]](0, 1), v[idx[3]](0, 1), v[idx[4]](0, 1), v[idx[5]](0, 1), v[idx[6]](0, 1), v[idx[7]](0, 1) };
        x.m[0][2] =  { v[idx[0]](0, 2), v[idx[1]](0, 2), v[idx[2]](0, 2), v[idx[3]](0, 2), v[idx[4]](0, 2), v[idx[5]](0, 2), v[idx[6]](0, 2), v[idx[7]](0, 2) };
        x.m[1][0] =  { v[idx[0]](1, 0), v[idx[1]](1, 0), v[idx[2]](1, 0), v[idx[3]](1, 0), v[idx[4]](1, 0), v[idx[5]](1, 0), v[idx[6]](1, 0), v[idx[7]](1, 0) };
        x.m[1][1] =  { v[idx[0]](1, 1), v[idx[1]](1, 1), v[idx[2]](1, 1), v[idx[3]](1, 1), v[idx[4]](1, 1), v[idx[5]](1, 1), v[idx[6]](1, 1), v[idx[7]](1, 1) };
        x.m[1][2] =  { v[idx[0]](1, 2), v[idx[1]](1, 2), v[idx[2]](1, 2), v[idx[3]](1, 2), v[idx[4]](1, 2), v[idx[5]](1, 2), v[idx[6]](1, 2), v[idx[7]](1, 2) };
        x.m[2][0] =  { v[idx[0]](2, 0), v[idx[1]](2, 0), v[idx[2]](2, 0), v[idx[3]](2, 0), v[idx[4]](2, 0), v[idx[5]](2, 0), v[idx[6]](2, 0), v[idx[7]](2, 0) };
        x.m[2][1] =  { v[idx[0]](2, 1), v[idx[1]](2, 1), v[idx[2]](2, 1), v[idx[3]](2, 1), v[idx[4]](2, 1), v[idx[5]](2, 1), v[idx[6]](2, 1), v[idx[7]](2, 1) };
        x.m[2][2] =  { v[idx[0]](2, 2), v[idx[1]](2, 2), v[idx[2]](2, 2), v[idx[3]](2, 2), v[idx[4]](2, 2), v[idx[5]](2, 2), v[idx[6]](2, 2), v[idx[7]](2, 2) };
    }
    return x;
}


inline void dyadicProduct(const Vector3f8 &a, const Vector3f8 &b, Matrix3f8 &res)
{
    res.m[0][0] = a.x().v * b.x().v; res.m[0][1] = a.x().v * b.y().v; res.m[0][2] = a.x().v * b.z().v;
    res.m[1][0] = a.y().v * b.x().v; res.m[1][1] = a.y().v * b.y().v; res.m[1][2] = a.y().v * b.z().v;
    res.m[2][0] = a.z().v * b.x().v; res.m[2][1] = a.z().v * b.y().v; res.m[2][2] = a.z().v * b.z().v;
}


// ----------------------------------------------------------------------------------------------
//4 dimensional vector of Scalar8f to represent 8 quaternions
class Quaternion8f
{
public:

    Scalarf8  q[4];

    inline Quaternion8f() { q[0] = 0.0; q[1] = 0.0; q[2] = 0.0; q[3] = 1.0; }

    inline Quaternion8f(Scalarf8 x, Scalarf8 y, Scalarf8 z, Scalarf8 w) {
        q[0] = x; q[1] = y; q[2] = z; q[3] = w;
    }

    inline Quaternion8f(Vector3f8& v) {
        q[0] = v[0]; q[1] = v[1]; q[2] = v[2]; q[3] = 0.0;
    }

    inline Scalarf8 & operator [] (int i) { return q[i]; }
    inline Scalarf8   operator [] (int i) const { return q[i]; }

    inline Scalarf8 & x() { return q[0]; }
    inline Scalarf8 & y() { return q[1]; }
    inline Scalarf8 & z() { return q[2]; }
    inline Scalarf8 & w() { return q[3]; }

    inline Scalarf8 x() const { return q[0]; }
    inline Scalarf8 y() const { return q[1]; }
    inline Scalarf8 z() const { return q[2]; }
    inline Scalarf8 w() const { return q[3]; }

    inline const Quaternion8f operator*(const Quaternion8f& a) const {
        return
            Quaternion8f(q[3] * a.q[0] + q[0] * a.q[3] + q[1] * a.q[2] - q[2] * a.q[1],
                q[3] * a.q[1] - q[0] * a.q[2] + q[1] * a.q[3] + q[2] * a.q[0],
                q[3] * a.q[2] + q[0] * a.q[1] - q[1] * a.q[0] + q[2] * a.q[3],
                q[3] * a.q[3] - q[0] * a.q[0] - q[1] * a.q[1] - q[2] * a.q[2]);
    }

    inline void toRotationMatrix(Matrix3f8& R)
    {
        const Scalarf8 tx = Scalarf8(2.0) * q[0];
        const Scalarf8 ty = Scalarf8(2.0) * q[1];
        const Scalarf8 tz = Scalarf8(2.0) * q[2];
        const Scalarf8 twx = tx*q[3];
        const Scalarf8 twy = ty*q[3];
        const Scalarf8 twz = tz*q[3];
        const Scalarf8 txx = tx*q[0];
        const Scalarf8 txy = ty*q[0];
        const Scalarf8 txz = tz*q[0];
        const Scalarf8 tyy = ty*q[1];
        const Scalarf8 tyz = tz*q[1];
        const Scalarf8 tzz = tz*q[2];

        R.m[0][0] = Scalarf8(1.0) - (tyy + tzz);
        R.m[0][1] = txy - twz;
        R.m[0][2] = txz + twy;
        R.m[1][0] = txy + twz;
        R.m[1][1] = Scalarf8(1.0) - (txx + tzz);
        R.m[1][2] = tyz - twx;
        R.m[2][0] = txz - twy;
        R.m[2][1] = tyz + twx;
        R.m[2][2] = Scalarf8(1.0) - (txx + tyy);
    }

    inline void toRotationMatrix(Vector3f8& R1, Vector3f8& R2, Vector3f8& R3)
    {
        const Scalarf8 tx = Scalarf8(2.0) * q[0];
        const Scalarf8 ty = Scalarf8(2.0) * q[1];
        const Scalarf8 tz = Scalarf8(2.0) * q[2];
        const Scalarf8 twx = tx*q[3];
        const Scalarf8 twy = ty*q[3];
        const Scalarf8 twz = tz*q[3];
        const Scalarf8 txx = tx*q[0];
        const Scalarf8 txy = ty*q[0];
        const Scalarf8 txz = tz*q[0];
        const Scalarf8 tyy = ty*q[1];
        const Scalarf8 tyz = tz*q[1];
        const Scalarf8 tzz = tz*q[2];

        R1[0] = Scalarf8(1.0) - (tyy + tzz);
        R2[0] = txy - twz;
        R3[0] = txz + twy;
        R1[1] = txy + twz;
        R2[1] = Scalarf8(1.0) - (txx + tzz);
        R3[1] = tyz - twx;
        R1[2] = txz - twy;
        R2[2] = tyz + twx;
        R3[2] = Scalarf8(1.0) - (txx + tyy);
    }

    inline void store(std::vector<Quaternionr>& qf) const
    {
        float x[8], y[8], z[8], w[8];
        q[0].store(x);
        q[1].store(y);
        q[2].store(z);
        q[3].store(w);

        for (int i = 0; i < 8; i++)
        {
            qf[i].x() = x[i];
            qf[i].y() = y[i];
            qf[i].z() = z[i];
            qf[i].w() = w[i];
        }
    }

    inline void store(Quaternionr *qf) const
    {
        float x[8], y[8], z[8], w[8];
        q[0].store(x);
        q[1].store(y);
        q[2].store(z);
        q[3].store(w);

        for (int i = 0; i < 8; i++)
        {
            qf[i].x() = x[i];
            qf[i].y() = y[i];
            qf[i].z() = z[i];
            qf[i].w() = w[i];
        }
    }

    inline void set(const Quaternionr *qf)
    {
        float x[8], y[8], z[8], w[8];
        for(int i=0; i<8; i++)
        {
            x[i] = static_cast<float>(qf[i].x());
            y[i] = static_cast<float>(qf[i].y());
            z[i] = static_cast<float>(qf[i].z());
            w[i] = static_cast<float>(qf[i].w());
        }
        Scalarf8 s;
        s.load(x);
        q[0] = s;
        s.load(y);
        q[1] = s;
        s.load(z);
        q[2] = s;
        s.load(w);
        q[3] = s;
    }
};
#endif



// ----------------------------------------------------------------------------------------------
//alligned allocator so that vectorized types can be used in std containers
//from: https://stackoverflow.com/questions/8456236/how-is-a-vectors-data-aligned
template <typename T, std::size_t N = 32>
class AlignmentAllocator {
public:
    typedef T value_type;
    typedef std::size_t size_type;
    typedef std::ptrdiff_t difference_type;

    typedef T * pointer;
    typedef const T * const_pointer;

    typedef T & reference;
    typedef const T & const_reference;

public:
    inline AlignmentAllocator() throw () { }

    template <typename T2>
    inline AlignmentAllocator(const AlignmentAllocator<T2, N> &) throw () { }

    inline ~AlignmentAllocator() throw () { }

    inline pointer adress(reference r) {
        return &r;
    }

    inline const_pointer adress(const_reference r) const {
        return &r;
    }

    inline pointer allocate(size_type n) {
#ifdef _WIN32
        return (pointer)_aligned_malloc(n * sizeof(value_type), N);
#elif defined(__linux__) || defined(__APPLE__)
        // NB! Argument order is opposite from MSVC/Windows
        return (pointer) aligned_alloc(N, n * sizeof(value_type));
#else
#error "Unknown platform"
#endif
    }

    inline void deallocate(pointer p, size_type) {
#ifdef _WIN32
        _aligned_free(p);
#elif defined(__linux__) || defined(__APPLE__)
        free(p);
#else
#error "Unknown platform"
#endif
    }

    inline void construct(pointer p, const value_type & wert) {
        new (p) value_type(wert);
    }

    inline void destroy(pointer p) {
        p->~value_type();
    }

    inline size_type max_size() const throw () {
        return size_type(-1) / sizeof(value_type);
    }

    template <typename T2>
    struct rebind {
        typedef AlignmentAllocator<T2, N> other;
    };

    bool operator!=(const AlignmentAllocator<T, N>& other) const {
        return !(*this == other);
    }

    // Returns true if and only if storage allocated from *this
    // can be deallocated from other, and vice versa.
    // Always returns true for stateless allocators.
    bool operator==(const AlignmentAllocator<T, N>& other) const {
        return true;
    }
};

namespace SPH
{
    class MathFunctions_AVX
    {
    public:
        static void APD_Newton_AVX(const Vector3f8& F1, const Vector3f8& F2, const Vector3f8& F3, Quaternion8f& q)
        {
            //one iteration is sufficient for plausible results
            for (int it = 0; it < 1; it++)
            {
                //transform quaternion to rotation matrix
                Matrix3f8 R;
                q.toRotationMatrix(R);

                //columns of B = RT * F
                Vector3f8 B0 = R.transpose() * F1;
                Vector3f8 B1 = R.transpose() * F2;
                Vector3f8 B2 = R.transpose() * F3;

                Vector3f8 gradient(B2[1] - B1[2], B0[2] - B2[0], B1[0] - B0[1]);

                //compute Hessian, use the fact that it is symmetric
                Scalarf8 h00 = B1[1] + B2[2];
                Scalarf8 h11 = B0[0] + B2[2];
                Scalarf8 h22 = B0[0] + B1[1];
                Scalarf8 h01 = Scalarf8(-0.5) * (B1[0] + B0[1]);
                Scalarf8 h02 = Scalarf8(-0.5) * (B2[0] + B0[2]);
                Scalarf8 h12 = Scalarf8(-0.5) * (B2[1] + B1[2]);

                Scalarf8 detH = Scalarf8(-1.0) * h02 * h02 * h11 + Scalarf8(2.0) * h01 * h02 * h12 - h00 * h12 * h12 - h01 * h01 * h22 + h00 * h11 * h22;

                Vector3f8 omega;
                //compute symmetric inverse
                const Scalarf8 factor = Scalarf8(-0.25) / detH;
                omega[0] = (h11 * h22 - h12 * h12) * gradient[0]
                    + (h02 * h12 - h01 * h22) * gradient[1]
                    + (h01 * h12 - h02 * h11) * gradient[2];
                omega[0] *= factor;

                omega[1] = (h02 * h12 - h01 * h22) * gradient[0]
                    + (h00 * h22 - h02 * h02) * gradient[1]
                    + (h01 * h02 - h00 * h12) * gradient[2];
                omega[1] *= factor;

                omega[2] = (h01 * h12 - h02 * h11) * gradient[0]
                    + (h01 * h02 - h00 * h12) * gradient[1]
                    + (h00 * h11 - h01 * h01) * gradient[2];
                omega[2] *= factor;

                omega = Vector3f8::blend(abs(detH) < 1.0e-9f, gradient * Scalarf8(-1.0), omega);    //if det(H) = 0 use gradient descent, never happened in our tests, could also be removed

                //instead of clamping just use gradient descent. also works fine and does not require the norm
                //Scalarf8 useGD = blend(omega * gradient > Scalarf8(0.0), Scalarf8(1.0), Scalarf8(-1.0));
                //omega = Vector3f8::blend(useGD > Scalarf8(0.0), gradient * Scalarf8(-0.125), omega);
                omega = Vector3f8::blend(omega * gradient > Scalarf8(0.0), gradient * Scalarf8(-0.125), omega);

                Scalarf8 l_omega2 = omega.squaredNorm();
                const Scalarf8 w = (1.0 - l_omega2) / (1.0 + l_omega2);
                const Vector3f8 vec = omega * (2.0 / (1.0 + l_omega2));
                q = q * Quaternion8f(vec.x(), vec.y(), vec.z(), w);     //no normalization needed because the Cayley map returs a unit quaternion
            }
        }
    };
}

#endif