From 6c5503ce59068506f34d719cefe91edc399d717a Mon Sep 17 00:00:00 2001
From: Nick Gasson <nick@nickg.me.uk>
Date: Sat, 15 Jan 2011 14:41:15 +0000
Subject: [PATCH] Use packed vector representation

---
 CMakeLists.txt      |   2 +-
 include/Maths.hpp   | 149 +++++++++++++++++++---------
 src/Editor.cpp      |  27 ++---
 src/Map.cpp         |   2 +-
 src/Mesh.cpp        |   2 +-
 src/Points.cpp      |  35 +++----
 tools/MathsTest.cpp | 237 +++++++++++++++++++++++++++++++++++++++++---
 7 files changed, 361 insertions(+), 93 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e147078..8c246c8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -42,7 +42,7 @@ include_directories (include ${FREETYPE_INCLUDE_DIRS})
 include_directories (include ${CMAKE_CURRENT_BINARY_DIR})
 
 if (NOT WIN32) # Unix
-  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
+  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -ffast-math")
   # "-Wconversion -Werror"
   if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "i686")
     set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2 -mfpmath=sse")
diff --git a/include/Maths.hpp b/include/Maths.hpp
index 0d425ba..0f30f76 100644
--- a/include/Maths.hpp
+++ b/include/Maths.hpp
@@ -24,28 +24,75 @@
 #include <ostream>
 #include <cassert>
 
-#if 0
 template <typename T, int N>
-union Packed;
+struct Packed;
 
 template <>
-union Packed<float, 4> {
-   int __attribute__((mode(V4SF))) packed;
-   float unpacked[4];
+struct Packed<float, 4> {
+   typedef float __attribute__((vector_size (16))) Type;  // 16-byte GCC vector of floats
 };
 
 template <>
-union Packed<float, 3> {
-   int __attribute__((mode(V3SF))) packed;
-   float unpacked[3];
+struct Packed<float, 3> {
+   typedef float __attribute__((vector_size (16))) Type;  // 16-byte GCC vector of floats
+   // Build a packed value from three scalars; the fourth slot is set to zero
+   static inline Type pack(float a, float b, float c)
+   {
+      union {  // written through f, read back through p
+         Type p;
+         float f[4];
+      } u;   
+      u.f[0] = a;
+      u.f[1] = b;
+      u.f[2] = c;
+      u.f[3] = 0.0f;  // unused padding slot
+      return u.p;
+   }
+};
+
+template <>
+struct Packed<int, 3> {
+   typedef int __attribute__((vector_size (16))) Type;  // 16-byte GCC vector of ints
+   // Build a packed value from three scalars; the fourth slot is set to zero
+   static inline Type pack(int a, int b, int c)
+   {
+      union Union {  // written through i, read back through p
+         Type p;
+         int i[4];
+      } u;
+      u.i[0] = a;
+      u.i[1] = b;
+      u.i[2] = c;
+      u.i[3] = 0;  // unused padding slot
+      return u.p;
+   }
+};
+
+template <typename T>
+struct EqTolerance;
+
+template <>
+struct EqTolerance<float> {
+   static const float Value = 0.001f;  // the delta the replaced approx_equal() calls passed
+};
+
+template <>
+struct EqTolerance<int> {
+   static const int Value = 0;  // zero tolerance: integer components must match exactly
 };
-#endif
 
 // A generic 3D vector
 template <typename T>
 struct Vector {
-   inline Vector(T x, T y, T z) : x(x), y(y), z(z) {}
-   inline Vector() : x(0), y(0), z(0) {}
+   inline Vector(T x = 0, T y = 0, T z = 0)  // any omitted component defaults to 0
+   {
+      packed = Packed<T, 3>::pack(x, y, z);
+   }
+
+   inline Vector(const typename Packed<T, 3>::Type& v)  // adopt an already-packed value
+   {
+      packed = v;
+   }
 
    // Cross product
    inline Vector<T> operator*(const Vector<T>& v) const
@@ -59,70 +106,85 @@ struct Vector {
    // Multiply by a scalar
    inline Vector<T> operator*(T t) const
    {
-      return Vector<T>(x*t, y*t, z*t);
+      // Pack t into every component slot and multiply element-wise
+      return Vector<T>(packed * Packed<T, 3>::pack(t, t, t));
    }
 
    // Divide by a scalar
    inline Vector<T> operator/(T t) const
    {
-      return Vector<T>(x/t, y/t, z/t);
+      const typename Packed<T, 3>::Type d = { t, t, t, T(1) };  // padding slot is 1, not pack()'s 0, so it never divides by zero
+      return Vector<T>(packed / d);
    }
 
    // Scalar product
    inline T dot(const Vector<T>&v) const
    {
-      return x*v.x + y*v.y + z*v.z;
+      // Multiply element-wise, then sum the three component products
+      const Vector<T> tmp = packed * v.packed;
+      return tmp.x + tmp.y + tmp.z;
    }
 
    // Magnitude
    inline T length() const
    {
-      return sqrt(x*x + y*y + z*z);
+      const T prod = dot(*this);  // stay in T: dot() returns T and the replaced code passed T to sqrt
+      return sqrt(prod);
    }
 
    inline Vector<T>& normalise()
    {
-      T m = length();
-      x /= m;
-      y /= m;
-      z /= m;
+      const T m = length();
+      // The divisor's padding slot is 1, not the 0 that pack() stores,
+      // so the unused fourth element is never divided by zero.
+      const typename Packed<T, 3>::Type divisor = { m, m, m, T(1) };
+      packed /= divisor;
       return *this;
    }
-
+   
    inline Vector<T> operator+(const Vector<T>& v) const
    {
-      return Vector<T>(x+v.x, y+v.y, z+v.z);
+      return Vector<T>(packed + v.packed);  // element-wise sum as a new vector
    }
-
+   
    inline Vector<T>& operator+=(const Vector<T>& v)
    {
-      x += v.x;
-      y += v.y;
-      z += v.z;
+      packed += v.packed;  // element-wise add in place
       return *this;
    }
    
    inline Vector<T> operator-(const Vector<T>& v) const
    {
-      return Vector<T>(x-v.x, y-v.y, z-v.z);
+      // Element-wise difference as a new vector
+      return Vector<T>(packed - v.packed);
    }
 
    inline Vector<T> operator-() const
    {
-      return Vector<T>(-x, -y, -z);
+      // Negate every element
+      return Vector<T>(-packed);
    }
 
    inline Vector<T>& operator-=(const Vector<T>& v)
    {
-      x -= v.x;
-      y -= v.y;
-      z -= v.z;
+      packed -= v.packed;  // element-wise subtract in place
       return *this;
    }
    
-   inline bool operator==(const Vector<T>& v) const
+   inline bool operator==(const Vector<T>& rhs) const
    {
-      return x == v.x && y == v.y && z == v.z;
+      // Approximate equality: each component of the difference is squared
+      // and compared against the square of EqTolerance<T>::Value, so no
+      // call to abs() is needed.
+
+      const typename Packed<T, 3>::Type diff = rhs.packed - packed;
+      const T delta2 = EqTolerance<T>::Value * EqTolerance<T>::Value;
+
+      const Vector<T> squared = diff * diff;
+      
+      return (squared.x <= delta2)
+         && (squared.y <= delta2)
+         && (squared.z <= delta2);
    }
 
    inline bool operator!=(const Vector<T>& v) const
@@ -137,22 +199,21 @@ struct Vector {
              && (y < rhs.y
                  || (y == rhs.y && z < rhs.z)));
    }
-   
-   bool approx_equal(const Vector<T>& rhs, T delta) const
-   {
-      return (abs(rhs.x - x) < delta)
-         && (abs(rhs.y - y) < delta)
-         && (abs(rhs.z - z) < delta);
-   }
-   
-   T x, y, z;
+
+   union {  // x, y, z share storage with packed
+      typename Packed<T, 3>::Type packed;
+      struct {  // anonymous struct: relies on a compiler extension
+         T x, y, z;
+      };
+   };
 };
 
+typedef Vector<float> VectorF;
+
 template <typename T>
 std::ostream& operator<<(std::ostream& s, const Vector<T>& v)
 {
-   return s << "[" << v.x << " " << v.y
-            << " " << v.z << "]";
+   return s << "[" << v.x << " " << v.y << " " << v.z << "]";
 }
 
 template <typename T>
@@ -161,8 +222,6 @@ inline Vector<T> make_vector(T x, T y, T z)
    return Vector<T>(x, y, z);
 }
 
-typedef Vector<float> VectorF;
-
 // Find a surface normal
 template <typename T>
 Vector<T> surface_normal(const Vector<T>& a, const Vector<T>& b,
diff --git a/src/Editor.cpp b/src/Editor.cpp
index ba5e51e..9db2ef6 100644
--- a/src/Editor.cpp
+++ b/src/Editor.cpp
@@ -249,11 +249,11 @@ bool Editor::draw_track_tile(Point<int> where, track::Direction axis)
    }
    else {
       bool level;
-      Vector<float> slope = map->slope_at(where, axis, level);
+      const VectorF slope = map->slope_at(where, axis, level);
 
       bool b_valid, a_valid;
-      Vector<float> slope_before = map->slope_before(where, axis, b_valid);
-      Vector<float> slope_after = map->slope_after(where, axis, a_valid);
+      const VectorF slope_before = map->slope_before(where, axis, b_valid);
+      const VectorF slope_after = map->slope_after(where, axis, a_valid);
                
       if (level) {
          const bool flat =
@@ -275,7 +275,8 @@ bool Editor::draw_track_tile(Point<int> where, track::Direction axis)
                        << " before=" << slope_before
                        << " after=" << slope_after;
 
-               map->set_track_at(where,
+               map->set_track_at(
+                  where,
                   make_slope_track(axis, slope, slope_before, slope_after));
 
                return true;
@@ -297,7 +298,7 @@ void Editor::draw_dragged_straight(const track::Direction& an_axis, int a_length
 
    for (int i = 0; i < a_length; i++) {
       draw_track_tile(where, an_axis);
-      
+
       where.x += an_axis.x;
       where.y += an_axis.z;
    }
@@ -561,12 +562,12 @@ void Editor::on_mouse_move(IPickBufferPtr pick_buffer, int x, int y,
    }
    else if (am_scrolling) {
       const float speed = 0.05f;
-      
-      my_position.x -= static_cast<float>(xrel) * speed;
-      my_position.z -= static_cast<float>(xrel) * speed;
-      
-      my_position.x += static_cast<float>(yrel) * speed;
-      my_position.z -= static_cast<float>(yrel) * speed;      
+
+      const VectorF xrelv(-xrel * speed, 0.0f, -xrel * speed);
+      const VectorF yrelv(yrel * speed, 0.0f, -yrel * speed);
+
+      my_position += xrelv;
+      my_position += yrelv;
    }
 }
 
@@ -598,10 +599,10 @@ void Editor::on_mouse_click(IPickBufferPtr pick_buffer, int x, int y,
       }
    }
    else if (a_button == MOUSE_WHEEL_UP) {
-      my_position.y -= 0.5f;
+      my_position -= VectorF(0.0f, 0.5f, 0.0f);
    }
    else if (a_button == MOUSE_WHEEL_DOWN) {
-      my_position.y += 0.5f;
+      my_position += VectorF(0.0f, 0.5f, 0.0f);
    }
 }
 
diff --git a/src/Map.cpp b/src/Map.cpp
index d302e1b..e1d8113 100644
--- a/src/Map.cpp
+++ b/src/Map.cpp
@@ -1168,7 +1168,7 @@ Vector<float> Map::slope_at(Point<int> where,
       v2 = height_map[indexes[1]].pos - height_map[indexes[2]].pos;
    }
 
-   level = v1.approx_equal(v2, 0.001f);
+   level = (v1 == v2);
 
 #if 0
    debug() << "slope_at where=" << where
diff --git a/src/Mesh.cpp b/src/Mesh.cpp
index da4d854..3901121 100644
--- a/src/Mesh.cpp
+++ b/src/Mesh.cpp
@@ -67,7 +67,7 @@ struct MeshBuffer : IMeshBuffer {
 
    static bool merge_vector(const Vector<float>& v1, const Vector<float>& v2)
    {
-      return v1.approx_equal(v2, 0.001f);
+      return v1 == v2;
    }
    
    vector<Vertex> vertices;
diff --git a/src/Points.cpp b/src/Points.cpp
index 5d3924f..947e01d 100644
--- a/src/Points.cpp
+++ b/src/Points.cpp
@@ -1,5 +1,5 @@
 //
-//  Copyright (C) 2009-2010  Nick Gasson
+//  Copyright (C) 2009-2011  Nick Gasson
 //
 //  This program is free software: you can redistribute it and/or modify
 //  it under the terms of the GNU General Public License as published by
@@ -21,6 +21,7 @@
 #include "ILogger.hpp"
 #include "BezierCurve.hpp"
 #include "Matrix.hpp"
+#include "OpenGLHelper.hpp"
 
 #include <cassert>
 
@@ -128,26 +129,26 @@ void Points::render_arrow() const
 
       for (float t = 0.0f; t < arrow_len; t += step) {
 
-         const Vector<float> v1 = curve(t);
-         const Vector<float> v2 = curve(t + step);
-
+         const VectorF v1 = curve(t);
+         const VectorF v2 = curve(t + step);
+         
          if (t >= arrow_len - step) {
             // Arrow head
             glBegin(GL_TRIANGLES);
             {
-               glVertex3f(v1.x, 0.0f, v1.z - head_width);
-               glVertex3f(v2.x, 0.0f, v2.z);
-               glVertex3f(v1.x, 0.0f, v1.z + head_width);
+               gl::vertex(make_vector(v1.x, 0.0f, v1.z - head_width));
+               gl::vertex(make_vector(v2.x, 0.0f, v2.z));
+               gl::vertex(make_vector(v1.x, 0.0f, v1.z + head_width));
             }
             glEnd();
          }
          else {
             glBegin(GL_QUADS);
             {
-               glVertex3f(v1.x, 0.0f, v1.z - 0.1f);
-               glVertex3f(v1.x, 0.0f, v1.z + 0.1f);
-               glVertex3f(v2.x, 0.0f, v2.z + 0.1f);
-               glVertex3f(v2.x, 0.0f, v2.z - 0.1f);
+               gl::vertex(make_vector(v1.x, 0.0f, v1.z - 0.1f));
+               gl::vertex(make_vector(v1.x, 0.0f, v1.z + 0.1f));
+               gl::vertex(make_vector(v2.x, 0.0f, v2.z + 0.1f));
+               gl::vertex(make_vector(v2.x, 0.0f, v2.z - 0.1f));
             }
             glEnd();
          }
@@ -217,11 +218,11 @@ void Points::merge(IMeshBufferPtr buf) const
 
    // Draw the curved sleepers
    for (float i = 0.25f; i < 1.0f; i += 0.08f) {
-      Vector<float> v = (reflected ? my_reflected_curve : my_curve)(i);
+      const VectorF v = (reflected ? my_reflected_curve : my_curve)(i);
 
-      Vector<float> t = make_vector(v.x - 0.5f, 0.0f, v.z);
-      Vector<float> soff = off + rotateY(t, y_angle);
-      const Vector<float> deriv =
+      const VectorF t = make_vector(v.x - 0.5f, 0.0f, v.z);
+      const VectorF soff = off + rotateY(t, y_angle);
+      const VectorF deriv =
          (reflected ? my_reflected_curve : my_curve).deriv(i);
       const float angle =
          rad_to_deg<float>(atanf(deriv.z / deriv.x));
@@ -354,11 +355,11 @@ void Points::transform(const track::TravelToken& a_token, float delta) const
       bool backwards = a_token.position == displaced_endpoint();
       
       const float f_value = backwards ? 1.0f - curve_delta : curve_delta;
-      const Vector<float> curve_value = my_curve(f_value);
+      const VectorF curve_value = my_curve(f_value);
       
       // Calculate the angle that the tangent to the curve at this
       // point makes to (one of) the axis at this point
-      const Vector<float> deriv = my_curve.deriv(f_value);
+      const VectorF deriv = my_curve.deriv(f_value);
       const float angle =
          rad_to_deg<float>(atanf(deriv.z / deriv.x));
 
diff --git a/tools/MathsTest.cpp b/tools/MathsTest.cpp
index e874a72..494c79f 100644
--- a/tools/MathsTest.cpp
+++ b/tools/MathsTest.cpp
@@ -1,6 +1,7 @@
 #include "Maths.hpp"
 
 #include <iostream>
+#include <cassert>
 
 /*
   Baseline:
@@ -32,6 +33,28 @@
   400be8:       f3 0f 7e 44 24 e8       movq   -0x18(%rsp),%xmm0
   400bee:       c3                      retq   
 
+  With packed float vector inside union:
+
+  0000000000400bf0 <vfadd>:
+  400bf0:       0f 28 07                movaps (%rdi),%xmm0
+  400bf3:       0f 58 06                addps  (%rsi),%xmm0
+  400bf6:       0f 29 44 24 a8          movaps %xmm0,-0x58(%rsp)
+  400bfb:       48 8b 44 24 a8          mov    -0x58(%rsp),%rax
+  400c00:       0f 29 44 24 d8          movaps %xmm0,-0x28(%rsp)
+  400c05:       48 89 44 24 a0          mov    %rax,-0x60(%rsp)
+  400c0a:       f3 0f 7e 4c 24 e0       movq   -0x20(%rsp),%xmm1
+  400c10:       f3 0f 7e 44 24 a0       movq   -0x60(%rsp),%xmm0
+  400c16:       c3                      retq   
+
+  Without the union:
+
+  0000000000400bf0 <vfadd>:
+  400bf0:       0f 28 07                movaps (%rdi),%xmm0
+  400bf3:       0f 58 06                addps  (%rsi),%xmm0
+  400bf6:       c3                      retq   
+  400bf7:       66 0f 1f 84 00 00 00    nopw   0x0(%rax,%rax,1)
+  400bfe:       00 00 
+  
  */
 
 extern "C" VectorF vfadd(const VectorF& a, const VectorF& b)
@@ -55,6 +78,16 @@ extern "C" VectorF vfadd(const VectorF& a, const VectorF& b)
   400d49:       f3 0f 7e 44 24 e8       movq   -0x18(%rsp),%xmm0
   400d4f:       c3                      retq   
 
+  Packed vector:
+
+  0000000000400dc0 <vfdiv>:
+  400dc0:       f3 0f 11 44 24 e8       movss  %xmm0,-0x18(%rsp)
+  400dc6:       f3 0f 11 44 24 ec       movss  %xmm0,-0x14(%rsp)
+  400dcc:       f3 0f 11 44 24 f0       movss  %xmm0,-0x10(%rsp)
+  400dd2:       0f 28 07                movaps (%rdi),%xmm0
+  400dd5:       0f 5e 44 24 e8          divps  -0x18(%rsp),%xmm0
+  400dda:       c3                      retq   
+  
  */
 
 extern "C" VectorF vfdiv(const VectorF& a, float f)
@@ -75,6 +108,17 @@ extern "C" VectorF vfdiv(const VectorF& a, float f)
   400d6b:       f3 0f 59 4e 08          mulss  0x8(%rsi),%xmm1
   400d70:       f3 0f 58 c1             addss  %xmm1,%xmm0
   400d74:       c3                      retq
+
+  Packed vector:
+
+  0000000000400de0 <vfdot>:
+  400de0:       0f 28 07                movaps (%rdi),%xmm0
+  400de3:       0f 59 06                mulps  (%rsi),%xmm0
+  400de6:       0f 29 44 24 e8          movaps %xmm0,-0x18(%rsp)
+  400deb:       f3 0f 10 44 24 e8       movss  -0x18(%rsp),%xmm0
+  400df1:       f3 0f 58 44 24 ec       addss  -0x14(%rsp),%xmm0
+  400df7:       f3 0f 58 44 24 f0       addss  -0x10(%rsp),%xmm0
+  400dfd:       c3                      retq   
   
 */
 
@@ -86,16 +130,47 @@ extern "C" float vfdot(const VectorF& a, const VectorF& b)
 /*
   Baseline:
 
-  0000000000400d50 <vfdot>:
-  400d50:       f3 0f 10 07             movss  (%rdi),%xmm0
-  400d54:       f3 0f 10 4f 04          movss  0x4(%rdi),%xmm1
-  400d59:       f3 0f 59 06             mulss  (%rsi),%xmm0
-  400d5d:       f3 0f 59 4e 04          mulss  0x4(%rsi),%xmm1
-  400d62:       f3 0f 58 c1             addss  %xmm1,%xmm0
-  400d66:       f3 0f 10 4f 08          movss  0x8(%rdi),%xmm1
-  400d6b:       f3 0f 59 4e 08          mulss  0x8(%rsi),%xmm1
-  400d70:       f3 0f 58 c1             addss  %xmm1,%xmm0
-  400d74:       c3                      retq
+  0000000000400ea0 <vflen>:
+  400ea0:       f3 0f 10 07             movss  (%rdi),%xmm0
+  400ea4:       f3 0f 10 57 04          movss  0x4(%rdi),%xmm2
+  400ea9:       f3 0f 59 c0             mulss  %xmm0,%xmm0
+  400ead:       f3 0f 59 d2             mulss  %xmm2,%xmm2
+  400eb1:       f3 0f 10 4f 08          movss  0x8(%rdi),%xmm1
+  400eb6:       f3 0f 59 c9             mulss  %xmm1,%xmm1
+  400eba:       f3 0f 58 c2             addss  %xmm2,%xmm0
+  400ebe:       f3 0f 58 c1             addss  %xmm1,%xmm0
+  400ec2:       f3 0f 51 c0             sqrtss %xmm0,%xmm0
+  400ec6:       c3                      retq
+
+  Packed vector:
+
+  0000000000400ee0 <vflen>:
+  400ee0:       48 83 ec 18             sub    $0x18,%rsp
+  400ee4:       0f 28 07                movaps (%rdi),%xmm0
+  400ee7:       0f 59 c0                mulps  %xmm0,%xmm0
+  400eea:       0f 29 04 24             movaps %xmm0,(%rsp)
+  400eee:       f3 0f 10 0c 24          movss  (%rsp),%xmm1
+  400ef3:       f3 0f 58 4c 24 04       addss  0x4(%rsp),%xmm1
+  400ef9:       f3 0f 58 4c 24 08       addss  0x8(%rsp),%xmm1
+  400eff:       f3 0f 51 c1             sqrtss %xmm1,%xmm0
+  400f03:       0f 2e c0                ucomiss %xmm0,%xmm0
+  400f06:       7a 02                   jp     400f0a <vflen+0x2a>
+  400f08:       74 08                   je     400f12 <vflen+0x32>
+  400f0a:       0f 28 c1                movaps %xmm1,%xmm0
+  400f0d:       e8 96 fd ff ff          callq  400ca8 <sqrtf@plt>
+  400f12:       48 83 c4 18             add    $0x18,%rsp
+  400f16:       c3                      retq
+
+  -ffast-math:
+
+  400e50:       0f 28 07                movaps (%rdi),%xmm0
+  400e53:       0f 59 c0                mulps  %xmm0,%xmm0
+  400e56:       0f 29 44 24 e8          movaps %xmm0,-0x18(%rsp)
+  400e5b:       f3 0f 10 44 24 ec       movss  -0x14(%rsp),%xmm0
+  400e61:       f3 0f 58 44 24 e8       addss  -0x18(%rsp),%xmm0
+  400e67:       f3 0f 58 44 24 f0       addss  -0x10(%rsp),%xmm0
+  400e6d:       f3 0f 51 c0             sqrtss %xmm0,%xmm0
+  400e71:       c3                      retq
   
 */
 
@@ -126,7 +201,55 @@ extern "C" float vflen(const VectorF& a)
   400e3b:       f3 0f 5e c8             divss  %xmm0,%xmm1
   400e3f:       f3 0f 11 57 04          movss  %xmm2,0x4(%rdi)
   400e44:       f3 0f 11 4f 08          movss  %xmm1,0x8(%rdi)
-  400e49:       c3                      retq   
+  400e49:       c3                      retq
+
+  Packed vector:
+  
+  0000000000400e80 <vfnorm>:
+  400e80:       53                      push   %rbx
+  400e81:       48 89 fb                mov    %rdi,%rbx
+  400e84:       48 83 ec 20             sub    $0x20,%rsp
+  400e88:       0f 28 17                movaps (%rdi),%xmm2
+  400e8b:       0f 28 c2                movaps %xmm2,%xmm0
+  400e8e:       0f 59 c2                mulps  %xmm2,%xmm0
+  400e91:       0f 29 04 24             movaps %xmm0,(%rsp)
+  400e95:       f3 0f 10 0c 24          movss  (%rsp),%xmm1
+  400e9a:       f3 0f 58 4c 24 04       addss  0x4(%rsp),%xmm1
+  400ea0:       f3 0f 58 4c 24 08       addss  0x8(%rsp),%xmm1
+  400ea6:       f3 0f 51 c1             sqrtss %xmm1,%xmm0
+  400eaa:       0f 2e c0                ucomiss %xmm0,%xmm0
+  400ead:       7a 02                   jp     400eb1 <vfnorm+0x31>
+  400eaf:       74 0b                   je     400ebc <vfnorm+0x3c>
+  400eb1:       0f 28 c1                movaps %xmm1,%xmm0
+  400eb4:       e8 ef fd ff ff          callq  400ca8 <sqrtf@plt>
+  400eb9:       0f 28 13                movaps (%rbx),%xmm2
+  400ebc:       f3 0f 11 44 24 10       movss  %xmm0,0x10(%rsp)
+  400ec2:       f3 0f 11 44 24 14       movss  %xmm0,0x14(%rsp)
+  400ec8:       f3 0f 11 44 24 18       movss  %xmm0,0x18(%rsp)
+  400ece:       0f 5e 54 24 10          divps  0x10(%rsp),%xmm2
+  400ed3:       0f 29 13                movaps %xmm2,(%rbx)
+  400ed6:       48 83 c4 20             add    $0x20,%rsp
+  400eda:       5b                      pop    %rbx
+  400edb:       c3                      retq   
+  400edc:       0f 1f 40 00             nopl   0x0(%rax)
+
+  -ffast-math:
+
+  0000000000400e10 <vfnorm>:
+  400e10:       0f 28 0f                movaps (%rdi),%xmm1
+  400e13:       0f 28 c1                movaps %xmm1,%xmm0
+  400e16:       0f 59 c1                mulps  %xmm1,%xmm0
+  400e19:       0f 29 44 24 d8          movaps %xmm0,-0x28(%rsp)
+  400e1e:       f3 0f 10 44 24 dc       movss  -0x24(%rsp),%xmm0
+  400e24:       f3 0f 58 44 24 d8       addss  -0x28(%rsp),%xmm0
+  400e2a:       f3 0f 58 44 24 e0       addss  -0x20(%rsp),%xmm0
+  400e30:       f3 0f 51 c0             sqrtss %xmm0,%xmm0
+  400e34:       f3 0f 11 44 24 e8       movss  %xmm0,-0x18(%rsp)
+  400e3a:       f3 0f 11 44 24 ec       movss  %xmm0,-0x14(%rsp)
+  400e40:       f3 0f 11 44 24 f0       movss  %xmm0,-0x10(%rsp)
+  400e46:       0f 5e 4c 24 e8          divps  -0x18(%rsp),%xmm1
+  400e4b:       0f 29 0f                movaps %xmm1,(%rdi)
+  400e4e:       c3                      retq
 */
 
 extern "C" void vfnorm(VectorF& a)
@@ -159,11 +282,95 @@ extern "C" void vfnorm(VectorF& a)
   400dbe:       66 90                   xchg   %ax,%ax
   400dc0:       31 c0                   xor    %eax,%eax
   400dc2:       c3                      retq
+
+  Packed vector:
+
+  0000000000400e00 <vfeq>:
+  400e00:       0f 28 0e                movaps (%rsi),%xmm1
+  400e03:       f3 0f 10 15 45 05 00    movss  0x545(%rip),%xmm2        # 401350 <_ZZ4mainE19__PRETTY_FUNCTION__+0x20>
+  400e0a:       00 
+  400e0b:       0f 5c 0f                subps  (%rdi),%xmm1
+  400e0e:       0f 29 4c 24 e8          movaps %xmm1,-0x18(%rsp)
+  400e13:       f3 0f 10 4c 24 e8       movss  -0x18(%rsp),%xmm1
+  400e19:       0f 54 ca                andps  %xmm2,%xmm1
+  400e1c:       0f 2e c1                ucomiss %xmm1,%xmm0
+  400e1f:       76 1f                   jbe    400e40 <vfeq+0x40>
+  400e21:       f3 0f 10 4c 24 ec       movss  -0x14(%rsp),%xmm1
+  400e27:       0f 54 ca                andps  %xmm2,%xmm1
+  400e2a:       0f 2e c1                ucomiss %xmm1,%xmm0
+  400e2d:       76 11                   jbe    400e40 <vfeq+0x40>
+  400e2f:       f3 0f 10 4c 24 f0       movss  -0x10(%rsp),%xmm1
+  400e35:       0f 54 ca                andps  %xmm2,%xmm1
+  400e38:       0f 2e c1                ucomiss %xmm1,%xmm0
+  400e3b:       0f 97 c0                seta   %al
+  400e3e:       c3                      retq   
+  400e3f:       90                      nop
+  400e40:       31 c0                   xor    %eax,%eax
+  400e42:       c3                      retq
+
+  -ffast-math:
+
+  0000000000400d90 <vfeq>:
+  400d90:       0f 28 0e                movaps (%rsi),%xmm1
+  400d93:       f3 0f 10 15 25 05 00    movss  0x525(%rip),%xmm2        # 4012c0 <_ZZ4mainE19__PRETTY_FUNCTION__+0x20>
+  400d9a:       00 
+  400d9b:       0f 5c 0f                subps  (%rdi),%xmm1
+  400d9e:       0f 29 4c 24 e8          movaps %xmm1,-0x18(%rsp)
+  400da3:       f3 0f 10 4c 24 e8       movss  -0x18(%rsp),%xmm1
+  400da9:       0f 54 ca                andps  %xmm2,%xmm1
+  400dac:       0f 2f c1                comiss %xmm1,%xmm0
+  400daf:       76 1f                   jbe    400dd0 <vfeq+0x40>
+  400db1:       f3 0f 10 4c 24 ec       movss  -0x14(%rsp),%xmm1
+  400db7:       0f 54 ca                andps  %xmm2,%xmm1
+  400dba:       0f 2f c1                comiss %xmm1,%xmm0
+  400dbd:       76 11                   jbe    400dd0 <vfeq+0x40>
+  400dbf:       f3 0f 10 4c 24 f0       movss  -0x10(%rsp),%xmm1
+  400dc5:       0f 54 ca                andps  %xmm2,%xmm1
+  400dc8:       0f 2f c1                comiss %xmm1,%xmm0
+  400dcb:       0f 97 c0                seta   %al
+  400dce:       c3                      retq   
+  400dcf:       90                      nop
+  400dd0:       31 c0                   xor    %eax,%eax
+  400dd2:       c3                      retq
+
+  Replace abs with square:
+
+  0000000000400dc0 <vfeq>:
+  400dc0:       0f 28 0e                movaps (%rsi),%xmm1
+  400dc3:       f3 0f 59 c0             mulss  %xmm0,%xmm0
+  400dc7:       0f 5c 0f                subps  (%rdi),%xmm1
+  400dca:       0f 59 c9                mulps  %xmm1,%xmm1
+  400dcd:       0f 29 4c 24 e8          movaps %xmm1,-0x18(%rsp)
+  400dd2:       0f 2f 44 24 e8          comiss -0x18(%rsp),%xmm0
+  400dd7:       76 17                   jbe    400df0 <vfeq+0x30>
+  400dd9:       0f 2f 44 24 ec          comiss -0x14(%rsp),%xmm0
+  400dde:       76 10                   jbe    400df0 <vfeq+0x30>
+  400de0:       0f 2f 44 24 f0          comiss -0x10(%rsp),%xmm0
+  400de5:       0f 97 c0                seta   %al
+  400de8:       c3                      retq
+
+  Remove delta parameter:
+  
+  0000000000400d90 <vfeq>:
+  400d90:       0f 28 06                movaps (%rsi),%xmm0
+  400d93:       0f 5c 07                subps  (%rdi),%xmm0
+  400d96:       0f 59 c0                mulps  %xmm0,%xmm0
+  400d99:       0f 29 44 24 e8          movaps %xmm0,-0x18(%rsp)
+  400d9e:       f3 0f 10 44 24 e8       movss  -0x18(%rsp),%xmm0
+  400da4:       0f 2f 05 dd 04 00 00    comiss 0x4dd(%rip),%xmm0
+  400dab:       73 23                   jae    400dd0 <vfeq+0x40>
+  400dad:       f3 0f 10 44 24 ec       movss  -0x14(%rsp),%xmm0
+  400db3:       0f 2f 05 ce 04 00 00    comiss 0x4ce(%rip),%xmm0
+  400dba:       73 14                   jae    400dd0 <vfeq+0x40>
+  400dbc:       f3 0f 10 44 24 f0       movss  -0x10(%rsp),%xmm0
+  400dc2:       0f 2f 05 bf 04 00 00    comiss 0x4bf(%rip),%xmm0
+  400dc9:       0f 92 c0                setb   %al
+  400dcc:       c3                      retq
 */
 
-extern "C" bool vfeq(const VectorF& a, const VectorF& b, float d)
+extern "C" bool vfeq(const VectorF& a, const VectorF& b)
 {
-   return a.approx_equal(b, d);
+   return a == b;
 }
 
 int main(int argc, char **argv)
@@ -175,8 +382,8 @@ int main(int argc, char **argv)
 
    cout << c << endl;
 
-   assert(!vfeq(a, b, 0.1f));
-   assert(vfeq(c, c, 0.1f));
+   assert(!vfeq(a, b));
+   assert(vfeq(c, c));
 
    vfnorm(a);
    assert(vflen(a) > 0.999f && vflen(a) < 1.001f);
-- 
2.39.2