xmm_quantize_sub.c

/*
 * MP3 quantization, intrinsics functions
 *
 * Copyright (c) 2005-2006 Gabriel Bouvigne
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

#include "lame.h"
#include "machine.h"
#include "encoder.h"
#include "util.h"
#include "lame_intrin.h"


#ifdef HAVE_XMMINTRIN_H

#include <xmmintrin.h>

typedef union {
    int32_t _i_32[4];  /* a union is initialized via its first member */
    float   _float[4];
    __m128  _m128;
} vecfloat_union;
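
/*
 * Each pair in costab[] is (cos x, sin x) for the starting twiddle angle
 * of one radix-4 pass: x = PI/(8 * 4^k) for entry k = 0..3, i.e. PI/8,
 * PI/32, PI/128 and PI/512. fht_SSE2() steps through further twiddles
 * with the angle-addition recurrence at the bottom of its stage loop.
 */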
#define TRI_SIZE (5-1)  /* 1024 = 4**5 */

static const FLOAT costab[TRI_SIZE * 2] = {
    9.238795325112867e-01, 3.826834323650898e-01,
    9.951847266721969e-01, 9.801714032956060e-02,
    9.996988186962042e-01, 2.454122852291229e-02,
    9.999811752826011e-01, 6.135884649154475e-03
};

/* make sure functions with SSE instructions maintain their own properly aligned stack */
#if defined (__GNUC__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 2)))
#define SSE_FUNCTION __attribute__((force_align_arg_pointer))
#else
#define SSE_FUNCTION
#endif
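
/*
 * Computes xrpow[i] = |xr[i]|^(3/4) for the first `upper` spectral
 * coefficients (sqrt(x * sqrt(x)) == x^0.75, the power required by
 * MP3's nonuniform quantizer), accumulates the sum of |xr[i]| into
 * *sum, and stores the largest xrpow value in cod_info->xrpow_max.
 * A scalar sketch of the same computation (the real scalar path is
 * init_xrpow_core_c() in quantize.c):
 *
 *     for (i = 0; i < upper; ++i) {
 *         float t = fabsf(cod_info->xr[i]);
 *         tmp_sum += t;
 *         xrpow[i] = sqrtf(t * sqrtf(t));
 *         if (xrpow[i] > tmp_max) tmp_max = xrpow[i];
 *     }
 */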
SSE_FUNCTION void
init_xrpow_core_sse(gr_info * const cod_info, FLOAT xrpow[576], int upper, FLOAT * sum)
{
    int     i;
    float   tmp_max = 0;
    float   tmp_sum = 0;
    int     upper4 = (upper / 4) * 4;
    int     rest = upper - upper4;

    const vecfloat_union fabs_mask = {{ 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF }};
    const __m128 vec_fabs_mask = _mm_loadu_ps(&fabs_mask._float[0]);
    vecfloat_union vec_xrpow_max;
    vecfloat_union vec_sum;
    vecfloat_union vec_tmp;

    _mm_prefetch((char *) cod_info->xr, _MM_HINT_T0);
    _mm_prefetch((char *) xrpow, _MM_HINT_T0);

    vec_xrpow_max._m128 = _mm_set_ps1(0);
    vec_sum._m128 = _mm_set_ps1(0);

    for (i = 0; i < upper4; i += 4) {
        vec_tmp._m128 = _mm_loadu_ps(&(cod_info->xr[i]));         /* load */
        vec_tmp._m128 = _mm_and_ps(vec_tmp._m128, vec_fabs_mask); /* fabs */
        vec_sum._m128 = _mm_add_ps(vec_sum._m128, vec_tmp._m128);
        vec_tmp._m128 = _mm_sqrt_ps(_mm_mul_ps(vec_tmp._m128, _mm_sqrt_ps(vec_tmp._m128)));
        vec_xrpow_max._m128 = _mm_max_ps(vec_xrpow_max._m128, vec_tmp._m128); /* retrieve max */
        _mm_storeu_ps(&(xrpow[i]), vec_tmp._m128);                /* store into xrpow[] */
    }
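
    /*
     * Tail: the remaining 1-3 coefficients go through the same vector
     * pipeline, packed into one partially filled vector. The case labels
     * below fall through intentionally, Duff's-device style.
     */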
    vec_tmp._m128 = _mm_set_ps1(0);
    switch (rest) {
    case 3:
        vec_tmp._float[2] = cod_info->xr[upper4 + 2]; /* fall through */
    case 2:
        vec_tmp._float[1] = cod_info->xr[upper4 + 1]; /* fall through */
    case 1:
        vec_tmp._float[0] = cod_info->xr[upper4 + 0];
        vec_tmp._m128 = _mm_and_ps(vec_tmp._m128, vec_fabs_mask); /* fabs */
        vec_sum._m128 = _mm_add_ps(vec_sum._m128, vec_tmp._m128);
        vec_tmp._m128 = _mm_sqrt_ps(_mm_mul_ps(vec_tmp._m128, _mm_sqrt_ps(vec_tmp._m128)));
        vec_xrpow_max._m128 = _mm_max_ps(vec_xrpow_max._m128, vec_tmp._m128); /* retrieve max */
        switch (rest) {
        case 3:
            xrpow[upper4 + 2] = vec_tmp._float[2]; /* fall through */
        case 2:
            xrpow[upper4 + 1] = vec_tmp._float[1]; /* fall through */
        case 1:
            xrpow[upper4 + 0] = vec_tmp._float[0];
        default:
            break;
        }
    default:
        break;
    }
    /* horizontal reduction of the four partial sums and maxima */
    tmp_sum = vec_sum._float[0] + vec_sum._float[1] + vec_sum._float[2] + vec_sum._float[3];
    {
        float   ma = vec_xrpow_max._float[0] > vec_xrpow_max._float[1]
                ? vec_xrpow_max._float[0] : vec_xrpow_max._float[1];
        float   mb = vec_xrpow_max._float[2] > vec_xrpow_max._float[3]
                ? vec_xrpow_max._float[2] : vec_xrpow_max._float[3];
        tmp_max = ma > mb ? ma : mb;
    }
    cod_info->xrpow_max = tmp_max;
    *sum = tmp_sum;
}
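
/* Scatter the four lanes of v to four scalar destinations
 * (lane 0 -> *f0, ..., lane 3 -> *f3). */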
SSE_FUNCTION static void
store4(__m128 v, float *f0, float *f1, float *f2, float *f3)
{
    vecfloat_union r;
    r._m128 = v;
    *f0 = r._float[0];
    *f1 = r._float[1];
    *f2 = r._float[2];
    *f3 = r._float[3];
}
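
/*
 * In-place radix-4 fast Hartley transform, the SSE counterpart of the
 * scalar fht() in fft.c (LAME computes its FFTs via the Hartley
 * transform). Callers pass n as half the transform length; it is doubled
 * on entry for interface compatibility with the 3DNow! assembly routine.
 */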
SSE_FUNCTION void
fht_SSE2(FLOAT * fz, int n)
{
    const FLOAT *tri = costab;
    int     k4;
    FLOAT  *fi, *gi;
    FLOAT const *fn;

    n <<= 1;            /* to get BLKSIZE, because of 3DNow! ASM routine */
    fn = fz + n;
    k4 = 4;
    do {
        FLOAT   s1, c1;
        int     i, k1, k2, k3, kx;
        kx = k4 >> 1;
        k1 = k4;
        k2 = k4 << 1;
        k3 = k2 + k1;
        k4 = k2 << 1;
        fi = fz;
        gi = fi + kx;
        /* butterflies with trivial twiddles (angles 0 and PI/4, where
           the Hartley cas factor reduces to 1 and SQRT2) */
        do {
            FLOAT   f0, f1, f2, f3;
            f1 = fi[0] - fi[k1];
            f0 = fi[0] + fi[k1];
            f3 = fi[k2] - fi[k3];
            f2 = fi[k2] + fi[k3];
            fi[k2] = f0 - f2;
            fi[0] = f0 + f2;
            fi[k3] = f1 - f3;
            fi[k1] = f1 + f3;
            f1 = gi[0] - gi[k1];
            f0 = gi[0] + gi[k1];
            f3 = SQRT2 * gi[k3];
            f2 = SQRT2 * gi[k2];
            gi[k2] = f0 - f2;
            gi[0] = f0 + f2;
            gi[k3] = f1 - f3;
            gi[k1] = f1 + f3;
            gi += k4;
            fi += k4;
        } while (fi < fn);
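        /*
         * General butterflies: (c1, s1) holds cos/sin of the current
         * twiddle angle and (c2, s2) the double angle. The sign flips
         * folded into v_c1, v_s1 and v_c2 below let one packed
         * multiply/add pattern cover all four sign combinations of the
         * scalar butterfly.
         */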
        c1 = tri[0];
        s1 = tri[1];
        for (i = 1; i < kx; i++) {
            __m128  v_s2;
            __m128  v_c2;
            __m128  v_c1;
            __m128  v_s1;
            FLOAT   c2, s2, s1_2 = s1 + s1;
            c2 = 1 - s1_2 * s1; /* cos(2x) = 1 - 2*sin(x)^2 */
            s2 = s1_2 * c1;     /* sin(2x) = 2*sin(x)*cos(x) */
            fi = fz + i;
            gi = fz + k1 - i;
            v_c1 = _mm_set_ps1(c1);
            v_s1 = _mm_set_ps1(s1);
            v_c2 = _mm_set_ps1(c2);
            v_s2 = _mm_set_ps1(s2);
            {
                static const vecfloat_union sign_mask = {{0x80000000, 0, 0, 0}};
                v_c1 = _mm_xor_ps(sign_mask._m128, v_c1); /* v_c1 := {-c1, +c1, +c1, +c1} */
            }
            {
                static const vecfloat_union sign_mask = {{0, 0x80000000, 0, 0}};
                v_s1 = _mm_xor_ps(sign_mask._m128, v_s1); /* v_s1 := {+s1, -s1, +s1, +s1} */
            }
            {
                static const vecfloat_union sign_mask = {{0, 0, 0x80000000, 0x80000000}};
                v_c2 = _mm_xor_ps(sign_mask._m128, v_c2); /* v_c2 := {+c2, +c2, -c2, -c2} */
            }
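            /*
             * Each iteration below performs the two coupled Hartley
             * butterflies at fi[] and gi[] with one set of packed
             * operations; the lane comments track each shuffle.
             */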
            do {
                __m128  p, q, r;
                q = _mm_setr_ps(fi[k1], fi[k3], gi[k1], gi[k3]); /* Q := {fi_k1,fi_k3,gi_k1,gi_k3} */
                p = _mm_mul_ps(v_s2, q);                         /* P := s2 * Q */
                q = _mm_mul_ps(v_c2, q);                         /* Q := {c2*fi_k1,c2*fi_k3,-c2*gi_k1,-c2*gi_k3} */
                q = _mm_shuffle_ps(q, q, _MM_SHUFFLE(1, 0, 3, 2)); /* Q := {-c2*gi_k1,-c2*gi_k3,c2*fi_k1,c2*fi_k3} */
                p = _mm_add_ps(p, q);
                r = _mm_setr_ps(gi[0], gi[k2], fi[0], fi[k2]);   /* R := {gi_0,gi_k2,fi_0,fi_k2} */
                q = _mm_sub_ps(r, p);                            /* Q := {gi_0-p0,gi_k2-p1,fi_0-p2,fi_k2-p3} */
                r = _mm_add_ps(r, p);                            /* R := {gi_0+p0,gi_k2+p1,fi_0+p2,fi_k2+p3} */
                p = _mm_shuffle_ps(q, r, _MM_SHUFFLE(2, 0, 2, 0)); /* P := {q0,q2,r0,r2} */
                p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 1, 2, 0)); /* P := {q0,r0,q2,r2} */
                q = _mm_shuffle_ps(q, r, _MM_SHUFFLE(3, 1, 3, 1)); /* Q := {q1,q3,r1,r3} */
                r = _mm_mul_ps(v_c1, q);
                q = _mm_mul_ps(v_s1, q);
                q = _mm_shuffle_ps(q, q, _MM_SHUFFLE(0, 1, 2, 3)); /* Q := {q3,q2,q1,q0} */
                q = _mm_add_ps(q, r);
                store4(_mm_sub_ps(p, q), &gi[k3], &gi[k2], &fi[k3], &fi[k2]);
                store4(_mm_add_ps(p, q), &gi[k1], &gi[ 0], &fi[k1], &fi[ 0]);
                gi += k4;
                fi += k4;
            } while (fi < fn);
            /* advance the twiddle pair by the stage's base angle */
            c2 = c1;
            c1 = c2 * tri[0] - s1 * tri[1];
            s1 = c2 * tri[1] + s1 * tri[0];
        }
        tri += 2;
    } while (k4 < n);
}

#endif /* HAVE_XMMINTRIN_H */