From 6c5503ce59068506f34d719cefe91edc399d717a Mon Sep 17 00:00:00 2001 From: Nick Gasson Date: Sat, 15 Jan 2011 14:41:15 +0000 Subject: [PATCH] Use packed vector representation --- CMakeLists.txt | 2 +- include/Maths.hpp | 149 +++++++++++++++++++--------- src/Editor.cpp | 27 ++--- src/Map.cpp | 2 +- src/Mesh.cpp | 2 +- src/Points.cpp | 35 +++---- tools/MathsTest.cpp | 237 +++++++++++++++++++++++++++++++++++++++++--- 7 files changed, 361 insertions(+), 93 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e147078..8c246c8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,7 +42,7 @@ include_directories (include ${FREETYPE_INCLUDE_DIRS}) include_directories (include ${CMAKE_CURRENT_BINARY_DIR}) if (NOT WIN32) # Unix - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -ffast-math") # "-Wconversion -Werror" if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "i686") set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2 -mfpmath=sse") diff --git a/include/Maths.hpp b/include/Maths.hpp index 0d425ba..0f30f76 100644 --- a/include/Maths.hpp +++ b/include/Maths.hpp @@ -24,28 +24,75 @@ #include #include -#if 0 template -union Packed; +struct Packed; template <> -union Packed { - int __attribute__((mode(V4SF))) packed; - float unpacked[4]; +struct Packed { + typedef float __attribute__((vector_size (16))) Type; }; template <> -union Packed { - int __attribute__((mode(V3SF))) packed; - float unpacked[3]; +struct Packed { + typedef float __attribute__((vector_size (16))) Type; + + static inline Type pack(float a, float b, float c) + { + union { + Type p; + float f[4]; + } u; + u.f[0] = a; + u.f[1] = b; + u.f[2] = c; + u.f[3] = 0.0f; + return u.p; + } +}; + +template <> +struct Packed { + typedef int __attribute__((vector_size (16))) Type; + + static inline Type pack(int a, int b, int c) + { + union Union { + Type p; + int i[4]; + } u; + u.i[0] = a; + u.i[1] = b; + u.i[2] = c; + u.i[3] = 0; + return u.p; + } +}; + +template +struct EqTolerance; + +template <> +struct EqTolerance { + static const float Value = 0.01f; +}; + +template <> +struct EqTolerance { + static const int Value = 0; }; -#endif // A generic 3D vector template struct Vector { - inline Vector(T x, T y, T z) : x(x), y(y), z(z) {} - inline Vector() : x(0), y(0), z(0) {} + inline Vector(T x = 0, T y = 0, T z = 0) + { + packed = Packed::pack(x, y, z); + } + + inline Vector(const typename Packed::Type& v) + { + packed = v; + } // Cross product inline Vector operator*(const Vector& v) const @@ -59,70 +106,85 @@ struct Vector { // Multiply by a scalar inline Vector operator*(T t) const { - return Vector(x*t, y*t, z*t); + //return Vector(comp.x*t, comp.y*t, comp.z*t); + return Vector(packed * Packed::pack(t, t, t)); } // Divide by a scalar inline Vector operator/(T t) const { - return Vector(x/t, y/t, z/t); + //return Vector(comp.x/t, comp.y/t, comp.z/t); + return Vector(packed / Packed::pack(t, t, t)); } // Scalar product inline T dot(const Vector&v) const { - return x*v.x + y*v.y + z*v.z; + //return comp.x*v.comp.x + comp.y*v.comp.y + comp.z*v.comp.z; + const Vector tmp = packed * v.packed; + return tmp.x + tmp.y + tmp.z; } // Magnitude inline T length() const { - return sqrt(x*x + y*y + z*z); + const float prod = dot(*this); + return sqrt(prod); } inline Vector& normalise() { - T m = length(); - x /= m; - y /= m; - z /= m; + const T m = length(); + //comp.x /= m; + //comp.y /= m; + //comp.z /= m; + packed /= Packed::pack(m, m, m); return *this; } - + inline Vector operator+(const Vector& v) const { - return Vector(x+v.x, y+v.y, z+v.z); + return Vector(packed + v.packed); } - + inline Vector& operator+=(const Vector& v) { - x += v.x; - y += v.y; - z += v.z; + packed += v.packed; return *this; } inline Vector operator-(const Vector& v) const { - return Vector(x-v.x, y-v.y, z-v.z); + //return Vector(comp.x-v.comp.x, comp.y-v.comp.y, comp.z-v.comp.z); + return Vector(packed - v.packed); } inline Vector operator-() const { - return Vector(-x, -y, -z); + //return Vector(-comp.x, -comp.y, -comp.z); + return Vector(-packed); } inline Vector& operator-=(const Vector& v) { - x -= v.x; - y -= v.y; - z -= v.z; + packed -= v.packed; return *this; } - inline bool operator==(const Vector& v) const + inline bool operator==(const Vector& rhs) const { - return x == v.x && y == v.y && z == v.z; + //return (abs(rhs.comp.x - comp.x) < delta) + // && (abs(rhs.comp.y - comp.y) < delta) + // && (abs(rhs.comp.z - comp.z) < delta); + + const typename Packed::Type diff = rhs.packed - packed; + const T delta2 = EqTolerance::Value * EqTolerance::Value; + + const Vector squared = diff * diff; + + return (squared.x <= delta2) + && (squared.y <= delta2) + && (squared.z <= delta2); } inline bool operator!=(const Vector& v) const @@ -137,22 +199,21 @@ struct Vector { && (y < rhs.y || (y == rhs.y && z < rhs.z))); } - - bool approx_equal(const Vector& rhs, T delta) const - { - return (abs(rhs.x - x) < delta) - && (abs(rhs.y - y) < delta) - && (abs(rhs.z - z) < delta); - } - - T x, y, z; + + union { + typename Packed::Type packed; + struct { + T x, y, z; + }; + }; }; +typedef Vector VectorF; + template std::ostream& operator<<(std::ostream& s, const Vector& v) { - return s << "[" << v.x << " " << v.y - << " " << v.z << "]"; + return s << "[" << v.x << " " << v.y << " " << v.z << "]"; } template @@ -161,8 +222,6 @@ inline Vector make_vector(T x, T y, T z) return Vector(x, y, z); } -typedef Vector VectorF; - // Find a surface normal template Vector surface_normal(const Vector& a, const Vector& b, diff --git a/src/Editor.cpp b/src/Editor.cpp index ba5e51e..9db2ef6 100644 --- a/src/Editor.cpp +++ b/src/Editor.cpp @@ -249,11 +249,11 @@ bool Editor::draw_track_tile(Point where, track::Direction axis) } else { bool level; - Vector slope = map->slope_at(where, axis, level); + const VectorF slope = map->slope_at(where, axis, level); bool b_valid, a_valid; - Vector slope_before = map->slope_before(where, axis, b_valid); - Vector slope_after = map->slope_after(where, axis, a_valid); + const VectorF slope_before = map->slope_before(where, axis, b_valid); + const VectorF slope_after = map->slope_after(where, axis, a_valid); if (level) { const bool flat = @@ -275,7 +275,8 @@ bool Editor::draw_track_tile(Point where, track::Direction axis) << " before=" << slope_before << " after=" << slope_after; - map->set_track_at(where, + map->set_track_at( + where, make_slope_track(axis, slope, slope_before, slope_after)); return true; @@ -297,7 +298,7 @@ void Editor::draw_dragged_straight(const track::Direction& an_axis, int a_length for (int i = 0; i < a_length; i++) { draw_track_tile(where, an_axis); - + where.x += an_axis.x; where.y += an_axis.z; } @@ -561,12 +562,12 @@ void Editor::on_mouse_move(IPickBufferPtr pick_buffer, int x, int y, } else if (am_scrolling) { const float speed = 0.05f; - - my_position.x -= static_cast(xrel) * speed; - my_position.z -= static_cast(xrel) * speed; - - my_position.x += static_cast(yrel) * speed; - my_position.z -= static_cast(yrel) * speed; + + const VectorF xrelv(-xrel * speed, 0.0f, -xrel * speed); + const VectorF yrelv(yrel * speed, 0.0f, -yrel * speed); + + my_position += xrelv; + my_position += yrelv; } } @@ -598,10 +599,10 @@ void Editor::on_mouse_click(IPickBufferPtr pick_buffer, int x, int y, } } else if (a_button == MOUSE_WHEEL_UP) { - my_position.y -= 0.5f; + my_position -= VectorF(0.0f, 0.5f, 0.0f); } else if (a_button == MOUSE_WHEEL_DOWN) { - my_position.y += 0.5f; + my_position += VectorF(0.0f, 0.5f, 0.0f); } } diff --git a/src/Map.cpp b/src/Map.cpp index d302e1b..e1d8113 100644 --- a/src/Map.cpp +++ b/src/Map.cpp @@ -1168,7 +1168,7 @@ Vector Map::slope_at(Point where, v2 = height_map[indexes[1]].pos - height_map[indexes[2]].pos; } - level = v1.approx_equal(v2, 0.001f); + level = (v1 == v2); #if 0 debug() << "slope_at where=" << where diff --git a/src/Mesh.cpp b/src/Mesh.cpp index da4d854..3901121 100644 --- a/src/Mesh.cpp +++ b/src/Mesh.cpp @@ -67,7 +67,7 @@ struct MeshBuffer : IMeshBuffer { static bool merge_vector(const Vector& v1, const Vector& v2) { - return v1.approx_equal(v2, 0.001f); + return v1 == v2; } vector vertices; diff --git a/src/Points.cpp b/src/Points.cpp index 5d3924f..947e01d 100644 --- a/src/Points.cpp +++ b/src/Points.cpp @@ -1,5 +1,5 @@ // -// Copyright (C) 2009-2010 Nick Gasson +// Copyright (C) 2009-2011 Nick Gasson // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by @@ -21,6 +21,7 @@ #include "ILogger.hpp" #include "BezierCurve.hpp" #include "Matrix.hpp" +#include "OpenGLHelper.hpp" #include @@ -128,26 +129,26 @@ void Points::render_arrow() const for (float t = 0.0f; t < arrow_len; t += step) { - const Vector v1 = curve(t); - const Vector v2 = curve(t + step); - + const VectorF v1 = curve(t); + const VectorF v2 = curve(t + step); + if (t >= arrow_len - step) { // Arrow head glBegin(GL_TRIANGLES); { - glVertex3f(v1.x, 0.0f, v1.z - head_width); - glVertex3f(v2.x, 0.0f, v2.z); - glVertex3f(v1.x, 0.0f, v1.z + head_width); + gl::vertex(make_vector(v1.x, 0.0f, v1.z - head_width)); + gl::vertex(make_vector(v2.x, 0.0f, v2.z)); + gl::vertex(make_vector(v1.x, 0.0f, v1.z + head_width)); } glEnd(); } else { glBegin(GL_QUADS); { - glVertex3f(v1.x, 0.0f, v1.z - 0.1f); - glVertex3f(v1.x, 0.0f, v1.z + 0.1f); - glVertex3f(v2.x, 0.0f, v2.z + 0.1f); - glVertex3f(v2.x, 0.0f, v2.z - 0.1f); + gl::vertex(make_vector(v1.x, 0.0f, v1.z - 0.1f)); + gl::vertex(make_vector(v1.x, 0.0f, v1.z + 0.1f)); + gl::vertex(make_vector(v2.x, 0.0f, v2.z + 0.1f)); + gl::vertex(make_vector(v2.x, 0.0f, v2.z - 0.1f)); } glEnd(); } @@ -217,11 +218,11 @@ void Points::merge(IMeshBufferPtr buf) const // Draw the curved sleepers for (float i = 0.25f; i < 1.0f; i += 0.08f) { - Vector v = (reflected ? my_reflected_curve : my_curve)(i); + const VectorF v = (reflected ? my_reflected_curve : my_curve)(i); - Vector t = make_vector(v.x - 0.5f, 0.0f, v.z); - Vector soff = off + rotateY(t, y_angle); - const Vector deriv = + const VectorF t = make_vector(v.x - 0.5f, 0.0f, v.z); + const VectorF soff = off + rotateY(t, y_angle); + const VectorF deriv = (reflected ? my_reflected_curve : my_curve).deriv(i); const float angle = rad_to_deg(atanf(deriv.z / deriv.x)); @@ -354,11 +355,11 @@ void Points::transform(const track::TravelToken& a_token, float delta) const bool backwards = a_token.position == displaced_endpoint(); const float f_value = backwards ? 1.0f - curve_delta : curve_delta; - const Vector curve_value = my_curve(f_value); + const VectorF curve_value = my_curve(f_value); // Calculate the angle that the tangent to the curve at this // point makes to (one of) the axis at this point - const Vector deriv = my_curve.deriv(f_value); + const VectorF deriv = my_curve.deriv(f_value); const float angle = rad_to_deg(atanf(deriv.z / deriv.x)); diff --git a/tools/MathsTest.cpp b/tools/MathsTest.cpp index e874a72..494c79f 100644 --- a/tools/MathsTest.cpp +++ b/tools/MathsTest.cpp @@ -1,6 +1,7 @@ #include "Maths.hpp" #include +#include /* Baseline: @@ -32,6 +33,28 @@ 400be8: f3 0f 7e 44 24 e8 movq -0x18(%rsp),%xmm0 400bee: c3 retq + With packed float vector inside union: + + 0000000000400bf0 : + 400bf0: 0f 28 07 movaps (%rdi),%xmm0 + 400bf3: 0f 58 06 addps (%rsi),%xmm0 + 400bf6: 0f 29 44 24 a8 movaps %xmm0,-0x58(%rsp) + 400bfb: 48 8b 44 24 a8 mov -0x58(%rsp),%rax + 400c00: 0f 29 44 24 d8 movaps %xmm0,-0x28(%rsp) + 400c05: 48 89 44 24 a0 mov %rax,-0x60(%rsp) + 400c0a: f3 0f 7e 4c 24 e0 movq -0x20(%rsp),%xmm1 + 400c10: f3 0f 7e 44 24 a0 movq -0x60(%rsp),%xmm0 + 400c16: c3 retq + + Without the union: + + 0000000000400bf0 : + 400bf0: 0f 28 07 movaps (%rdi),%xmm0 + 400bf3: 0f 58 06 addps (%rsi),%xmm0 + 400bf6: c3 retq + 400bf7: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1) + 400bfe: 00 00 + */ extern "C" VectorF vfadd(const VectorF& a, const VectorF& b) @@ -55,6 +78,16 @@ extern "C" VectorF vfadd(const VectorF& a, const VectorF& b) 400d49: f3 0f 7e 44 24 e8 movq -0x18(%rsp),%xmm0 400d4f: c3 retq + Packed vector: + + 0000000000400dc0 : + 400dc0: f3 0f 11 44 24 e8 movss %xmm0,-0x18(%rsp) + 400dc6: f3 0f 11 44 24 ec movss %xmm0,-0x14(%rsp) + 400dcc: f3 0f 11 44 24 f0 movss %xmm0,-0x10(%rsp) + 400dd2: 0f 28 07 movaps (%rdi),%xmm0 + 400dd5: 0f 5e 44 24 e8 divps -0x18(%rsp),%xmm0 + 400dda: c3 retq + */ extern "C" VectorF vfdiv(const VectorF& a, float f) @@ -75,6 +108,17 @@ extern "C" VectorF vfdiv(const VectorF& a, float f) 400d6b: f3 0f 59 4e 08 mulss 0x8(%rsi),%xmm1 400d70: f3 0f 58 c1 addss %xmm1,%xmm0 400d74: c3 retq + + Packed vector: + + 0000000000400de0 : + 400de0: 0f 28 07 movaps (%rdi),%xmm0 + 400de3: 0f 59 06 mulps (%rsi),%xmm0 + 400de6: 0f 29 44 24 e8 movaps %xmm0,-0x18(%rsp) + 400deb: f3 0f 10 44 24 e8 movss -0x18(%rsp),%xmm0 + 400df1: f3 0f 58 44 24 ec addss -0x14(%rsp),%xmm0 + 400df7: f3 0f 58 44 24 f0 addss -0x10(%rsp),%xmm0 + 400dfd: c3 retq */ @@ -86,16 +130,47 @@ extern "C" float vfdot(const VectorF& a, const VectorF& b) /* Baseline: - 0000000000400d50 : - 400d50: f3 0f 10 07 movss (%rdi),%xmm0 - 400d54: f3 0f 10 4f 04 movss 0x4(%rdi),%xmm1 - 400d59: f3 0f 59 06 mulss (%rsi),%xmm0 - 400d5d: f3 0f 59 4e 04 mulss 0x4(%rsi),%xmm1 - 400d62: f3 0f 58 c1 addss %xmm1,%xmm0 - 400d66: f3 0f 10 4f 08 movss 0x8(%rdi),%xmm1 - 400d6b: f3 0f 59 4e 08 mulss 0x8(%rsi),%xmm1 - 400d70: f3 0f 58 c1 addss %xmm1,%xmm0 - 400d74: c3 retq + 0000000000400ea0 : + 400ea0: f3 0f 10 07 movss (%rdi),%xmm0 + 400ea4: f3 0f 10 57 04 movss 0x4(%rdi),%xmm2 + 400ea9: f3 0f 59 c0 mulss %xmm0,%xmm0 + 400ead: f3 0f 59 d2 mulss %xmm2,%xmm2 + 400eb1: f3 0f 10 4f 08 movss 0x8(%rdi),%xmm1 + 400eb6: f3 0f 59 c9 mulss %xmm1,%xmm1 + 400eba: f3 0f 58 c2 addss %xmm2,%xmm0 + 400ebe: f3 0f 58 c1 addss %xmm1,%xmm0 + 400ec2: f3 0f 51 c0 sqrtss %xmm0,%xmm0 + 400ec6: c3 retq + + Packed vector: + + 0000000000400ee0 : + 400ee0: 48 83 ec 18 sub $0x18,%rsp + 400ee4: 0f 28 07 movaps (%rdi),%xmm0 + 400ee7: 0f 59 c0 mulps %xmm0,%xmm0 + 400eea: 0f 29 04 24 movaps %xmm0,(%rsp) + 400eee: f3 0f 10 0c 24 movss (%rsp),%xmm1 + 400ef3: f3 0f 58 4c 24 04 addss 0x4(%rsp),%xmm1 + 400ef9: f3 0f 58 4c 24 08 addss 0x8(%rsp),%xmm1 + 400eff: f3 0f 51 c1 sqrtss %xmm1,%xmm0 + 400f03: 0f 2e c0 ucomiss %xmm0,%xmm0 + 400f06: 7a 02 jp 400f0a + 400f08: 74 08 je 400f12 + 400f0a: 0f 28 c1 movaps %xmm1,%xmm0 + 400f0d: e8 96 fd ff ff callq 400ca8 + 400f12: 48 83 c4 18 add $0x18,%rsp + 400f16: c3 retq + + -ffast-math: + + 400e50: 0f 28 07 movaps (%rdi),%xmm0 + 400e53: 0f 59 c0 mulps %xmm0,%xmm0 + 400e56: 0f 29 44 24 e8 movaps %xmm0,-0x18(%rsp) + 400e5b: f3 0f 10 44 24 ec movss -0x14(%rsp),%xmm0 + 400e61: f3 0f 58 44 24 e8 addss -0x18(%rsp),%xmm0 + 400e67: f3 0f 58 44 24 f0 addss -0x10(%rsp),%xmm0 + 400e6d: f3 0f 51 c0 sqrtss %xmm0,%xmm0 + 400e71: c3 retq */ @@ -126,7 +201,55 @@ extern "C" float vflen(const VectorF& a) 400e3b: f3 0f 5e c8 divss %xmm0,%xmm1 400e3f: f3 0f 11 57 04 movss %xmm2,0x4(%rdi) 400e44: f3 0f 11 4f 08 movss %xmm1,0x8(%rdi) - 400e49: c3 retq + 400e49: c3 retq + + Packed vector: + + 0000000000400e80 : + 400e80: 53 push %rbx + 400e81: 48 89 fb mov %rdi,%rbx + 400e84: 48 83 ec 20 sub $0x20,%rsp + 400e88: 0f 28 17 movaps (%rdi),%xmm2 + 400e8b: 0f 28 c2 movaps %xmm2,%xmm0 + 400e8e: 0f 59 c2 mulps %xmm2,%xmm0 + 400e91: 0f 29 04 24 movaps %xmm0,(%rsp) + 400e95: f3 0f 10 0c 24 movss (%rsp),%xmm1 + 400e9a: f3 0f 58 4c 24 04 addss 0x4(%rsp),%xmm1 + 400ea0: f3 0f 58 4c 24 08 addss 0x8(%rsp),%xmm1 + 400ea6: f3 0f 51 c1 sqrtss %xmm1,%xmm0 + 400eaa: 0f 2e c0 ucomiss %xmm0,%xmm0 + 400ead: 7a 02 jp 400eb1 + 400eaf: 74 0b je 400ebc + 400eb1: 0f 28 c1 movaps %xmm1,%xmm0 + 400eb4: e8 ef fd ff ff callq 400ca8 + 400eb9: 0f 28 13 movaps (%rbx),%xmm2 + 400ebc: f3 0f 11 44 24 10 movss %xmm0,0x10(%rsp) + 400ec2: f3 0f 11 44 24 14 movss %xmm0,0x14(%rsp) + 400ec8: f3 0f 11 44 24 18 movss %xmm0,0x18(%rsp) + 400ece: 0f 5e 54 24 10 divps 0x10(%rsp),%xmm2 + 400ed3: 0f 29 13 movaps %xmm2,(%rbx) + 400ed6: 48 83 c4 20 add $0x20,%rsp + 400eda: 5b pop %rbx + 400edb: c3 retq + 400edc: 0f 1f 40 00 nopl 0x0(%rax) + + -ffast-math: + + 0000000000400e10 : + 400e10: 0f 28 0f movaps (%rdi),%xmm1 + 400e13: 0f 28 c1 movaps %xmm1,%xmm0 + 400e16: 0f 59 c1 mulps %xmm1,%xmm0 + 400e19: 0f 29 44 24 d8 movaps %xmm0,-0x28(%rsp) + 400e1e: f3 0f 10 44 24 dc movss -0x24(%rsp),%xmm0 + 400e24: f3 0f 58 44 24 d8 addss -0x28(%rsp),%xmm0 + 400e2a: f3 0f 58 44 24 e0 addss -0x20(%rsp),%xmm0 + 400e30: f3 0f 51 c0 sqrtss %xmm0,%xmm0 + 400e34: f3 0f 11 44 24 e8 movss %xmm0,-0x18(%rsp) + 400e3a: f3 0f 11 44 24 ec movss %xmm0,-0x14(%rsp) + 400e40: f3 0f 11 44 24 f0 movss %xmm0,-0x10(%rsp) + 400e46: 0f 5e 4c 24 e8 divps -0x18(%rsp),%xmm1 + 400e4b: 0f 29 0f movaps %xmm1,(%rdi) + 400e4e: c3 retq */ extern "C" void vfnorm(VectorF& a) @@ -159,11 +282,95 @@ extern "C" void vfnorm(VectorF& a) 400dbe: 66 90 xchg %ax,%ax 400dc0: 31 c0 xor %eax,%eax 400dc2: c3 retq + + Packed vector: + + 0000000000400e00 : + 400e00: 0f 28 0e movaps (%rsi),%xmm1 + 400e03: f3 0f 10 15 45 05 00 movss 0x545(%rip),%xmm2 # 401350 <_ZZ4mainE19__PRETTY_FUNCTION__+0x20> + 400e0a: 00 + 400e0b: 0f 5c 0f subps (%rdi),%xmm1 + 400e0e: 0f 29 4c 24 e8 movaps %xmm1,-0x18(%rsp) + 400e13: f3 0f 10 4c 24 e8 movss -0x18(%rsp),%xmm1 + 400e19: 0f 54 ca andps %xmm2,%xmm1 + 400e1c: 0f 2e c1 ucomiss %xmm1,%xmm0 + 400e1f: 76 1f jbe 400e40 + 400e21: f3 0f 10 4c 24 ec movss -0x14(%rsp),%xmm1 + 400e27: 0f 54 ca andps %xmm2,%xmm1 + 400e2a: 0f 2e c1 ucomiss %xmm1,%xmm0 + 400e2d: 76 11 jbe 400e40 + 400e2f: f3 0f 10 4c 24 f0 movss -0x10(%rsp),%xmm1 + 400e35: 0f 54 ca andps %xmm2,%xmm1 + 400e38: 0f 2e c1 ucomiss %xmm1,%xmm0 + 400e3b: 0f 97 c0 seta %al + 400e3e: c3 retq + 400e3f: 90 nop + 400e40: 31 c0 xor %eax,%eax + 400e42: c3 retq + + -ffast-math: + + 0000000000400d90 : + 400d90: 0f 28 0e movaps (%rsi),%xmm1 + 400d93: f3 0f 10 15 25 05 00 movss 0x525(%rip),%xmm2 # 4012c0 <_ZZ4mainE19__PRETTY_FUNCTION__+0x20> + 400d9a: 00 + 400d9b: 0f 5c 0f subps (%rdi),%xmm1 + 400d9e: 0f 29 4c 24 e8 movaps %xmm1,-0x18(%rsp) + 400da3: f3 0f 10 4c 24 e8 movss -0x18(%rsp),%xmm1 + 400da9: 0f 54 ca andps %xmm2,%xmm1 + 400dac: 0f 2f c1 comiss %xmm1,%xmm0 + 400daf: 76 1f jbe 400dd0 + 400db1: f3 0f 10 4c 24 ec movss -0x14(%rsp),%xmm1 + 400db7: 0f 54 ca andps %xmm2,%xmm1 + 400dba: 0f 2f c1 comiss %xmm1,%xmm0 + 400dbd: 76 11 jbe 400dd0 + 400dbf: f3 0f 10 4c 24 f0 movss -0x10(%rsp),%xmm1 + 400dc5: 0f 54 ca andps %xmm2,%xmm1 + 400dc8: 0f 2f c1 comiss %xmm1,%xmm0 + 400dcb: 0f 97 c0 seta %al + 400dce: c3 retq + 400dcf: 90 nop + 400dd0: 31 c0 xor %eax,%eax + 400dd2: c3 retq + + Replace abs with square: + + 0000000000400dc0 : + 400dc0: 0f 28 0e movaps (%rsi),%xmm1 + 400dc3: f3 0f 59 c0 mulss %xmm0,%xmm0 + 400dc7: 0f 5c 0f subps (%rdi),%xmm1 + 400dca: 0f 59 c9 mulps %xmm1,%xmm1 + 400dcd: 0f 29 4c 24 e8 movaps %xmm1,-0x18(%rsp) + 400dd2: 0f 2f 44 24 e8 comiss -0x18(%rsp),%xmm0 + 400dd7: 76 17 jbe 400df0 + 400dd9: 0f 2f 44 24 ec comiss -0x14(%rsp),%xmm0 + 400dde: 76 10 jbe 400df0 + 400de0: 0f 2f 44 24 f0 comiss -0x10(%rsp),%xmm0 + 400de5: 0f 97 c0 seta %al + 400de8: c3 retq + + Remove delta parameter: + + 0000000000400d90 : + 400d90: 0f 28 06 movaps (%rsi),%xmm0 + 400d93: 0f 5c 07 subps (%rdi),%xmm0 + 400d96: 0f 59 c0 mulps %xmm0,%xmm0 + 400d99: 0f 29 44 24 e8 movaps %xmm0,-0x18(%rsp) + 400d9e: f3 0f 10 44 24 e8 movss -0x18(%rsp),%xmm0 + 400da4: 0f 2f 05 dd 04 00 00 comiss 0x4dd(%rip),%xmm0 + 400dab: 73 23 jae 400dd0 + 400dad: f3 0f 10 44 24 ec movss -0x14(%rsp),%xmm0 + 400db3: 0f 2f 05 ce 04 00 00 comiss 0x4ce(%rip),%xmm0 + 400dba: 73 14 jae 400dd0 + 400dbc: f3 0f 10 44 24 f0 movss -0x10(%rsp),%xmm0 + 400dc2: 0f 2f 05 bf 04 00 00 comiss 0x4bf(%rip),%xmm0 + 400dc9: 0f 92 c0 setb %al + 400dcc: c3 retq */ -extern "C" bool vfeq(const VectorF& a, const VectorF& b, float d) +extern "C" bool vfeq(const VectorF& a, const VectorF& b) { - return a.approx_equal(b, d); + return a == b; } int main(int argc, char **argv) @@ -175,8 +382,8 @@ int main(int argc, char **argv) cout << c << endl; - assert(!vfeq(a, b, 0.1f)); - assert(vfeq(c, c, 0.1f)); + assert(!vfeq(a, b)); + assert(vfeq(c, c)); vfnorm(a); assert(vflen(a) > 0.999f && vflen(a) < 1.001f); -- 2.39.2