Hi, On Apr 26 20:46:51, b...@comstyle.com wrote: > Implement SSE2 lrint() and lrintf() on amd64.
I don't think this is worth the added complexity: seven more patches to have a different lrint()? Does it make the resampling noticeably better/faster? Also, the patch changes the CONFIGURE_STYLE from gnu to autoreconf and hardwires the autoconf and automake version (without explicitly depending on them) - presumably because configure.ac is patched so ./configure must be recreated. This seems to basically replicate the SSE2 github commit(s) in the port. I would wait for a release that will already contain this. Jan > > Index: Makefile > =================================================================== > RCS file: /cvs/ports/audio/libsamplerate/Makefile,v > retrieving revision 1.27 > diff -u -p -u -p -r1.27 Makefile > --- Makefile 5 Sep 2023 16:13:38 -0000 1.27 > +++ Makefile 27 Apr 2024 00:26:05 -0000 > @@ -2,7 +2,7 @@ COMMENT= audio sample rate conversion li > > VER= 0.2.2 > DISTNAME= libsamplerate-${VER} > -REVISION= 0 > +REVISION= 1 > CATEGORIES= audio > EXTRACT_SUFX= .tar.xz > > @@ -18,7 +18,9 @@ SITES= https://github.com/libsndfile/lib > > WANTLIB= m > > -CONFIGURE_STYLE=gnu > +AUTOCONF_VERSION= 2.71 > +AUTOMAKE_VERSION= 1.16 > +CONFIGURE_STYLE=autoreconf > CONFIGURE_ARGS= --disable-cpu-clip \ > --disable-fftw \ > --disable-sndfile > Index: patches/patch-configure_ac > =================================================================== > RCS file: patches/patch-configure_ac > diff -N patches/patch-configure_ac > --- /dev/null 1 Jan 1970 00:00:00 -0000 > +++ patches/patch-configure_ac 27 Apr 2024 00:26:05 -0000 > @@ -0,0 +1,43 @@ > +- Implement SSE2 lrint() and lrintf() > + 7a81766b14fa03e97822cf1e0b1651648df13116 > +- use sse2 intrinsics for lrint/lrintf only on windows x64 > + c01e2405612ad3561bf93e8e6dddb9ba0dffe4d9 > +- sse2 lrint/lrintf updates > + c164eaa25ffdeedc7d25e731172cc45a25f483d4 > + > +Index: configure.ac > +--- configure.ac.orig > ++++ configure.ac > +@@ -89,7 +89,7 @@ m4_define([abi_version_patch], [lt_revision]) > + > + dnl > 
==================================================================================== > + > +-AC_CHECK_HEADERS([stdbool.h stdint.h sys/times.h unistd.h]) > ++AC_CHECK_HEADERS([stdbool.h stdint.h sys/times.h unistd.h immintrin.h]) > + > + dnl > ==================================================================================== > + dnl Couple of initializations here. Fill in real values later. > +@@ -105,6 +105,9 @@ AC_ARG_ENABLE([werror], > + AC_ARG_ENABLE([cpu-clip], > + [AS_HELP_STRING([--disable-cpu-clip], [disable tricky cpu specific > clipper])]) > + > ++AC_ARG_ENABLE([sse2-lrint], > ++ [AS_HELP_STRING([--enable-sse2-lrint], [implement lrintf using SSE2 on > x86 CPUs if possible])]) > ++ > + AC_ARG_ENABLE([sndfile], > + [AS_HELP_STRING([--disable-sndfile], [disable support for sndfile > (default=autodetect)])], [], [enable_sndfile=auto]) > + > +@@ -178,6 +181,13 @@ AS_IF([test "x$enable_cpu_clip" != "xno"], [ > + > + AC_DEFINE_UNQUOTED([CPU_CLIPS_POSITIVE], [${ac_cv_c_clip_positive}], [Host > processor clips on positive float to int conversion.]) > + AC_DEFINE_UNQUOTED([CPU_CLIPS_NEGATIVE], [${ac_cv_c_clip_negative}], [Host > processor clips on negative float to int conversion.]) > ++ > ++dnl > ==================================================================================== > ++dnl Determine if the user enabled lrint implementations using SSE2. > ++ > ++AS_IF([test "x$enable_sse2_lrint" = "xyes"], [ > ++ CFLAGS="$CFLAGS -DENABLE_SSE2_LRINT" > ++ ]) > + > + dnl > ==================================================================================== > + dnl Check for libsndfile which is required for the test and example > programs. 
> Index: patches/patch-examples_audio_out_c > =================================================================== > RCS file: patches/patch-examples_audio_out_c > diff -N patches/patch-examples_audio_out_c > --- /dev/null 1 Jan 1970 00:00:00 -0000 > +++ patches/patch-examples_audio_out_c 27 Apr 2024 00:26:05 -0000 > @@ -0,0 +1,19 @@ > +- Implement SSE2 lrint() and lrintf() > + 7a81766b14fa03e97822cf1e0b1651648df13116 > +- use sse2 intrinsics for lrint/lrintf only on windows x64 > + c01e2405612ad3561bf93e8e6dddb9ba0dffe4d9 > +- sse2 lrint/lrintf updates > + c164eaa25ffdeedc7d25e731172cc45a25f483d4 > + > +Index: examples/audio_out.c > +--- examples/audio_out.c.orig > ++++ examples/audio_out.c > +@@ -960,7 +960,7 @@ solaris_play (get_audio_callback_t callback, AUDIO_OUT > + > + while ((read_frames = callback (callback_data, float_buffer, BUFFER_LEN > / solaris_out->channels))) > + { for (k = 0 ; k < read_frames * solaris_out->channels ; k++) > +- buffer [k] = lrint (32767.0 * float_buffer [k]) ; > ++ buffer [k] = psf_lrint (32767.0 * float_buffer [k]) ; > + write (solaris_out->fd, buffer, read_frames * > solaris_out->channels * sizeof (short)) ; > + } ; > + > Index: patches/patch-src_common_h > =================================================================== > RCS file: patches/patch-src_common_h > diff -N patches/patch-src_common_h > --- /dev/null 1 Jan 1970 00:00:00 -0000 > +++ patches/patch-src_common_h 27 Apr 2024 00:26:05 -0000 > @@ -0,0 +1,98 @@ > +- Implement SSE2 lrint() and lrintf() > + 7a81766b14fa03e97822cf1e0b1651648df13116 > +- use sse2 intrinsics for lrint/lrintf only on windows x64 > + c01e2405612ad3561bf93e8e6dddb9ba0dffe4d9 > +- sse2 lrint/lrintf updates > + c164eaa25ffdeedc7d25e731172cc45a25f483d4 > + > +Index: src/common.h > +--- src/common.h.orig > ++++ src/common.h > +@@ -14,6 +14,36 @@ > + #include <stdbool.h> > + #endif > + > ++#if defined(__x86_64__) || defined(_M_X64) > ++# define HAVE_SSE2_INTRINSICS > ++#elif defined(ENABLE_SSE2_LRINT) && 
(defined(_M_IX86) || defined(__i386__)) > ++# if defined(_MSC_VER) > ++# define HAVE_SSE2_INTRINSICS > ++# elif defined(__clang__) > ++# ifdef __SSE2__ > ++# define HAVE_SSE2_INTRINSICS > ++# elif (__has_attribute(target)) > ++# define HAVE_SSE2_INTRINSICS > ++# define USE_TARGET_ATTRIBUTE > ++# endif > ++# elif defined(__GNUC__) > ++# ifdef __SSE2__ > ++# define HAVE_SSE2_INTRINSICS > ++# elif (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)) > ++# define HAVE_SSE2_INTRINSICS > ++# define USE_TARGET_ATTRIBUTE > ++# endif > ++# endif > ++#endif > ++ > ++#ifdef HAVE_SSE2_INTRINSICS > ++#ifdef HAVE_IMMINTRIN_H > ++#include <immintrin.h> > ++#else > ++#include <emmintrin.h> > ++#endif > ++#endif /* HAVE_SSE2_INTRINSICS */ > ++ > + #include <math.h> > + > + #ifdef HAVE_VISIBILITY > +@@ -163,6 +193,41 @@ const char* zoh_get_description (int src_enum) ; > + SRC_STATE *zoh_state_new (int channels, SRC_ERROR *error) ; > + > + /*---------------------------------------------------------- > ++** SIMD optimized math functions. > ++*/ > ++ > ++#ifdef HAVE_SSE2_INTRINSICS > ++static inline int > ++#ifdef USE_TARGET_ATTRIBUTE > ++__attribute__((target("sse2"))) > ++#endif > ++psf_lrintf (float x) > ++{ > ++ return _mm_cvtss_si32 (_mm_load_ss (&x)) ; > ++} > ++static inline int > ++#ifdef USE_TARGET_ATTRIBUTE > ++__attribute__((target("sse2"))) > ++#endif > ++psf_lrint (double x) > ++{ > ++ return _mm_cvtsd_si32 (_mm_load_sd (&x)) ; > ++} > ++ > ++#else > ++ > ++static inline int psf_lrintf (float x) > ++{ > ++ return lrintf (x) ; > ++} /* psf_lrintf */ > ++ > ++static inline int psf_lrint (double x) > ++{ > ++ return lrint (x) ; > ++} /* psf_lrint */ > ++#endif > ++ > ++/*---------------------------------------------------------- > + ** Common static inline functions. 
> + */ > + > +@@ -170,7 +235,7 @@ static inline double > + fmod_one (double x) > + { double res ; > + > +- res = x - lrint (x) ; > ++ res = x - psf_lrint (x) ; > + if (res < 0.0) > + return res + 1.0 ; > + > Index: patches/patch-src_samplerate_c > =================================================================== > RCS file: patches/patch-src_samplerate_c > diff -N patches/patch-src_samplerate_c > --- /dev/null 1 Jan 1970 00:00:00 -0000 > +++ patches/patch-src_samplerate_c 27 Apr 2024 00:26:05 -0000 > @@ -0,0 +1,28 @@ > +- Implement SSE2 lrint() and lrintf() > + 7a81766b14fa03e97822cf1e0b1651648df13116 > +- use sse2 intrinsics for lrint/lrintf only on windows x64 > + c01e2405612ad3561bf93e8e6dddb9ba0dffe4d9 > +- sse2 lrint/lrintf updates > + c164eaa25ffdeedc7d25e731172cc45a25f483d4 > + > +Index: src/samplerate.c > +--- src/samplerate.c.orig > ++++ src/samplerate.c > +@@ -445,7 +445,7 @@ src_float_to_short_array (const float *in, short *out, > + else if (scaled_value <= -32768.f) > + out [i] = -32768 ; > + else > +- out [i] = (short) (lrintf (scaled_value)) ; > ++ out [i] = (short) (psf_lrintf (scaled_value)) ; > + } > + } /* src_float_to_short_array */ > + > +@@ -477,7 +477,7 @@ src_float_to_int_array (const float *in, int *out, int > + continue ; > + } ; > + #endif > +- out [i] = (int) lrint (scaled_value) ; > ++ out [i] = (int) psf_lrint (scaled_value) ; > + } ; > + > + } /* src_float_to_int_array */ > Index: patches/patch-src_src_linear_c > =================================================================== > RCS file: patches/patch-src_src_linear_c > diff -N patches/patch-src_src_linear_c > --- /dev/null 1 Jan 1970 00:00:00 -0000 > +++ patches/patch-src_src_linear_c 27 Apr 2024 00:26:05 -0000 > @@ -0,0 +1,28 @@ > +- Implement SSE2 lrint() and lrintf() > + 7a81766b14fa03e97822cf1e0b1651648df13116 > +- use sse2 intrinsics for lrint/lrintf only on windows x64 > + c01e2405612ad3561bf93e8e6dddb9ba0dffe4d9 > +- sse2 lrint/lrintf updates > + 
c164eaa25ffdeedc7d25e731172cc45a25f483d4 > + > +Index: src/src_linear.c > +--- src/src_linear.c.orig > ++++ src/src_linear.c > +@@ -102,7 +102,7 @@ linear_vari_process (SRC_STATE *state, SRC_DATA *data) > + } ; > + > + rem = fmod_one (input_index) ; > +- priv->in_used += state->channels * lrint (input_index - rem) ; > ++ priv->in_used += state->channels * psf_lrint (input_index - rem) ; > + input_index = rem ; > + > + /* Main processing loop. */ > +@@ -128,7 +128,7 @@ linear_vari_process (SRC_STATE *state, SRC_DATA *data) > + input_index += 1.0 / src_ratio ; > + rem = fmod_one (input_index) ; > + > +- priv->in_used += state->channels * lrint (input_index - rem) ; > ++ priv->in_used += state->channels * psf_lrint (input_index - > rem) ; > + input_index = rem ; > + } ; > + > Index: patches/patch-src_src_sinc_c > =================================================================== > RCS file: patches/patch-src_src_sinc_c > diff -N patches/patch-src_src_sinc_c > --- /dev/null 1 Jan 1970 00:00:00 -0000 > +++ patches/patch-src_src_sinc_c 27 Apr 2024 00:26:05 -0000 > @@ -0,0 +1,148 @@ > +- Implement SSE2 lrint() and lrintf() > + 7a81766b14fa03e97822cf1e0b1651648df13116 > +- use sse2 intrinsics for lrint/lrintf only on windows x64 > + c01e2405612ad3561bf93e8e6dddb9ba0dffe4d9 > +- sse2 lrint/lrintf updates > + c164eaa25ffdeedc7d25e731172cc45a25f483d4 > + > +Index: src/src_sinc.c > +--- src/src_sinc.c.orig > ++++ src/src_sinc.c > +@@ -132,7 +132,7 @@ static SRC_STATE_VT sinc_mono_state_vt = > + > + static inline increment_t > + double_to_fp (double x) > +-{ return (increment_t) (lrint ((x) * FP_ONE)) ; > ++{ return (increment_t) (psf_lrint ((x) * FP_ONE)) ; > + } /* double_to_fp */ > + > + static inline increment_t > +@@ -240,7 +240,7 @@ sinc_filter_new (int converter_type, int channels) > + #endif > + } > + > +- priv->b_len = 3 * (int) lrint ((priv->coeff_half_len + 2.0) / > priv->index_inc * SRC_MAX_RATIO + 1) ; > ++ priv->b_len = 3 * (int) psf_lrint ((priv->coeff_half_len 
+ 2.0) > / priv->index_inc * SRC_MAX_RATIO + 1) ; > + priv->b_len = MAX (priv->b_len, 4096) ; > + priv->b_len *= channels ; > + priv->b_len += 1 ; // There is a <= check against > samples_in_hand requiring a buffer bigger than the calculation above > +@@ -458,12 +458,12 @@ sinc_mono_vari_process (SRC_STATE *state, SRC_DATA *da > + count /= MIN (state->last_ratio, data->src_ratio) ; > + > + /* Maximum coefficientson either side of center point. */ > +- half_filter_chan_len = state->channels * (int) (lrint (count) + 1) ; > ++ half_filter_chan_len = state->channels * (int) (psf_lrint (count) + 1) ; > + > + input_index = state->last_position ; > + > + rem = fmod_one (input_index) ; > +- filter->b_current = (filter->b_current + state->channels * lrint > (input_index - rem)) % filter->b_len ; > ++ filter->b_current = (filter->b_current + state->channels * psf_lrint > (input_index - rem)) % filter->b_len ; > + input_index = rem ; > + > + terminate = 1.0 / src_ratio + 1e-20 ; > +@@ -505,7 +505,7 @@ sinc_mono_vari_process (SRC_STATE *state, SRC_DATA *da > + input_index += 1.0 / src_ratio ; > + rem = fmod_one (input_index) ; > + > +- filter->b_current = (filter->b_current + state->channels * > lrint (input_index - rem)) % filter->b_len ; > ++ filter->b_current = (filter->b_current + state->channels * > psf_lrint (input_index - rem)) % filter->b_len ; > + input_index = rem ; > + } ; > + > +@@ -614,12 +614,12 @@ sinc_stereo_vari_process (SRC_STATE *state, SRC_DATA * > + count /= MIN (state->last_ratio, data->src_ratio) ; > + > + /* Maximum coefficientson either side of center point. 
*/ > +- half_filter_chan_len = state->channels * (int) (lrint (count) + 1) ; > ++ half_filter_chan_len = state->channels * (int) (psf_lrint (count) + 1) ; > + > + input_index = state->last_position ; > + > + rem = fmod_one (input_index) ; > +- filter->b_current = (filter->b_current + state->channels * lrint > (input_index - rem)) % filter->b_len ; > ++ filter->b_current = (filter->b_current + state->channels * psf_lrint > (input_index - rem)) % filter->b_len ; > + input_index = rem ; > + > + terminate = 1.0 / src_ratio + 1e-20 ; > +@@ -660,7 +660,7 @@ sinc_stereo_vari_process (SRC_STATE *state, SRC_DATA * > + input_index += 1.0 / src_ratio ; > + rem = fmod_one (input_index) ; > + > +- filter->b_current = (filter->b_current + state->channels * > lrint (input_index - rem)) % filter->b_len ; > ++ filter->b_current = (filter->b_current + state->channels * > psf_lrint (input_index - rem)) % filter->b_len ; > + input_index = rem ; > + } ; > + > +@@ -770,12 +770,12 @@ sinc_quad_vari_process (SRC_STATE *state, SRC_DATA *da > + count /= MIN (state->last_ratio, data->src_ratio) ; > + > + /* Maximum coefficientson either side of center point. 
*/ > +- half_filter_chan_len = state->channels * (int) (lrint (count) + 1) ; > ++ half_filter_chan_len = state->channels * (int) (psf_lrint (count) + 1) ; > + > + input_index = state->last_position ; > + > + rem = fmod_one (input_index) ; > +- filter->b_current = (filter->b_current + state->channels * lrint > (input_index - rem)) % filter->b_len ; > ++ filter->b_current = (filter->b_current + state->channels * psf_lrint > (input_index - rem)) % filter->b_len ; > + input_index = rem ; > + > + terminate = 1.0 / src_ratio + 1e-20 ; > +@@ -816,7 +816,7 @@ sinc_quad_vari_process (SRC_STATE *state, SRC_DATA *da > + input_index += 1.0 / src_ratio ; > + rem = fmod_one (input_index) ; > + > +- filter->b_current = (filter->b_current + state->channels * > lrint (input_index - rem)) % filter->b_len ; > ++ filter->b_current = (filter->b_current + state->channels * > psf_lrint (input_index - rem)) % filter->b_len ; > + input_index = rem ; > + } ; > + > +@@ -925,12 +925,12 @@ sinc_hex_vari_process (SRC_STATE *state, SRC_DATA *dat > + count /= MIN (state->last_ratio, data->src_ratio) ; > + > + /* Maximum coefficientson either side of center point. 
*/ > +- half_filter_chan_len = state->channels * (int) (lrint (count) + 1) ; > ++ half_filter_chan_len = state->channels * (int) (psf_lrint (count) + 1) ; > + > + input_index = state->last_position ; > + > + rem = fmod_one (input_index) ; > +- filter->b_current = (filter->b_current + state->channels * lrint > (input_index - rem)) % filter->b_len ; > ++ filter->b_current = (filter->b_current + state->channels * psf_lrint > (input_index - rem)) % filter->b_len ; > + input_index = rem ; > + > + terminate = 1.0 / src_ratio + 1e-20 ; > +@@ -971,7 +971,7 @@ sinc_hex_vari_process (SRC_STATE *state, SRC_DATA *dat > + input_index += 1.0 / src_ratio ; > + rem = fmod_one (input_index) ; > + > +- filter->b_current = (filter->b_current + state->channels * > lrint (input_index - rem)) % filter->b_len ; > ++ filter->b_current = (filter->b_current + state->channels * > psf_lrint (input_index - rem)) % filter->b_len ; > + input_index = rem ; > + } ; > + > +@@ -1090,12 +1090,12 @@ sinc_multichan_vari_process (SRC_STATE *state, > SRC_DAT > + count /= MIN (state->last_ratio, data->src_ratio) ; > + > + /* Maximum coefficientson either side of center point. 
*/ > +- half_filter_chan_len = state->channels * (int) (lrint (count) + 1) ; > ++ half_filter_chan_len = state->channels * (int) (psf_lrint (count) + 1) ; > + > + input_index = state->last_position ; > + > + rem = fmod_one (input_index) ; > +- filter->b_current = (filter->b_current + state->channels * lrint > (input_index - rem)) % filter->b_len ; > ++ filter->b_current = (filter->b_current + state->channels * psf_lrint > (input_index - rem)) % filter->b_len ; > + input_index = rem ; > + > + terminate = 1.0 / src_ratio + 1e-20 ; > +@@ -1136,7 +1136,7 @@ sinc_multichan_vari_process (SRC_STATE *state, SRC_DAT > + input_index += 1.0 / src_ratio ; > + rem = fmod_one (input_index) ; > + > +- filter->b_current = (filter->b_current + state->channels * > lrint (input_index - rem)) % filter->b_len ; > ++ filter->b_current = (filter->b_current + state->channels * > psf_lrint (input_index - rem)) % filter->b_len ; > + input_index = rem ; > + } ; > + > Index: patches/patch-src_src_zoh_c > =================================================================== > RCS file: patches/patch-src_src_zoh_c > diff -N patches/patch-src_src_zoh_c > --- /dev/null 1 Jan 1970 00:00:00 -0000 > +++ patches/patch-src_src_zoh_c 27 Apr 2024 00:26:05 -0000 > @@ -0,0 +1,28 @@ > +- Implement SSE2 lrint() and lrintf() > + 7a81766b14fa03e97822cf1e0b1651648df13116 > +- use sse2 intrinsics for lrint/lrintf only on windows x64 > + c01e2405612ad3561bf93e8e6dddb9ba0dffe4d9 > +- sse2 lrint/lrintf updates > + c164eaa25ffdeedc7d25e731172cc45a25f483d4 > + > +Index: src/src_zoh.c > +--- src/src_zoh.c.orig > ++++ src/src_zoh.c > +@@ -99,7 +99,7 @@ zoh_vari_process (SRC_STATE *state, SRC_DATA *data) > + } ; > + > + rem = fmod_one (input_index) ; > +- priv->in_used += state->channels * lrint (input_index - rem) ; > ++ priv->in_used += state->channels * psf_lrint (input_index - rem) ; > + input_index = rem ; > + > + /* Main processing loop. 
*/ > +@@ -117,7 +117,7 @@ zoh_vari_process (SRC_STATE *state, SRC_DATA *data) > + input_index += 1.0 / src_ratio ; > + rem = fmod_one (input_index) ; > + > +- priv->in_used += state->channels * lrint (input_index - rem) ; > ++ priv->in_used += state->channels * psf_lrint (input_index - > rem) ; > + input_index = rem ; > + } ; > + > >