Re: [Xen-devel] [PATCH] x86/build: Use new .nops directive when available
>>> On 28.08.18 at 19:58, wrote: > On 17/08/18 13:45, Jan Beulich wrote: > On 15.08.18 at 19:57, wrote: >>> --- a/xen/arch/x86/alternative.c >>> +++ b/xen/arch/x86/alternative.c >>> @@ -84,6 +84,19 @@ static const unsigned char * const >>> p6_nops[ASM_NOP_MAX+1] > >>> init_or_livepatch_cons >>> >>> static const unsigned char * const *ideal_nops init_or_livepatch_data = >>> p6_nops; >>> >>> +#ifdef HAVE_AS_NOP_DIRECTIVE >>> + >>> +/* Nops in .init.rodata to compare against the runtime ideal nops. */ >>> +asm ( ".pushsection .init.rodata, \"a\", @progbits\n\t" >>> + "toolchain_nops: .nops " __stringify(ASM_NOP_MAX) "\n\t" >>> + ".popsection\n\t"); >> Any particular reason not to put them in .init.text? > > Because its data, not executable code. Well, depends. My view is that any insn is (potentially) executable code. In fact I like gas'es behavior on ARM where insns and data directives are separated by emitting "artificial" labels as markers. But I'm fine either way. Jan ___ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH] x86/build: Use new .nops directive when available
On 17/08/18 13:45, Jan Beulich wrote: On 15.08.18 at 19:57, wrote: >> --- a/xen/arch/x86/Rules.mk >> +++ b/xen/arch/x86/Rules.mk >> @@ -29,6 +29,10 @@ $(call as-option-add,CFLAGS,CC,"invpcid >> (%rax)$$(comma)%rax",-DHAVE_AS_INVPCID) >> $(call as-option-add,CFLAGS,CC,\ >> ".if ((1 > 0) < 0); .error \"\";.endif",,-DHAVE_AS_NEGATIVE_TRUE) >> >> +# Check to see whether the assmbler supports the .nop directive. >> +$(call as-option-add,CFLAGS,CC,\ >> +".L1: .L2: .nops (.L2 - .L1)$$(comma)9",-DHAVE_AS_NOP_DIRECTIVE) > Can this please be HAVE_AS_NOPS_DIRECTIVE? Ok. > >> --- a/xen/arch/x86/alternative.c >> +++ b/xen/arch/x86/alternative.c >> @@ -84,6 +84,19 @@ static const unsigned char * const p6_nops[ASM_NOP_MAX+1] >> init_or_livepatch_cons >> >> static const unsigned char * const *ideal_nops init_or_livepatch_data = >> p6_nops; >> >> +#ifdef HAVE_AS_NOP_DIRECTIVE >> + >> +/* Nops in .init.rodata to compare against the runtime ideal nops. */ >> +asm ( ".pushsection .init.rodata, \"a\", @progbits\n\t" >> + "toolchain_nops: .nops " __stringify(ASM_NOP_MAX) "\n\t" >> + ".popsection\n\t"); > Any particular reason not to put them in .init.text? Because its data, not executable code. > >> @@ -27,6 +26,14 @@ extern void add_nops(void *insns, unsigned int len); >> extern void apply_alternatives(struct alt_instr *start, struct alt_instr >> *end); >> extern void alternative_instructions(void); >> >> +asm ( ".macro mknops nr_bytes\n\t" >> +#ifdef HAVE_AS_NOP_DIRECTIVE >> + ".nops \\nr_bytes, " __stringify(ASM_NOP_MAX) "\n\t" >> +#else >> + ".skip \\nr_bytes, 0x90\n\t" >> +#endif >> + ".endm\n\t" ); > Did you take a look at "x86/alternatives: allow using assembler macros > in favor of C ones" yet? Perhaps this redundant macro definition could > be avoided that way? I'm not going to exclude though that some more > work might be needed to get there. No - not looked at that yet. That does look like we can get a large net cleanup, and can avoid using always_inline to work around the same problem. > > Anyway, I'm in principle fine with the change, irrespective of the other > discussion we've had, so with the name change and possibly the other > two remarks taken care of > Acked-by: Jan Beulich Thanks, ~Andrew ___ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH] x86/build: Use new .nops directive when available
>>> On 15.08.18 at 19:57, wrote: > --- a/xen/arch/x86/Rules.mk > +++ b/xen/arch/x86/Rules.mk > @@ -29,6 +29,10 @@ $(call as-option-add,CFLAGS,CC,"invpcid > (%rax)$$(comma)%rax",-DHAVE_AS_INVPCID) > $(call as-option-add,CFLAGS,CC,\ > ".if ((1 > 0) < 0); .error \"\";.endif",,-DHAVE_AS_NEGATIVE_TRUE) > > +# Check to see whether the assmbler supports the .nop directive. > +$(call as-option-add,CFLAGS,CC,\ > +".L1: .L2: .nops (.L2 - .L1)$$(comma)9",-DHAVE_AS_NOP_DIRECTIVE) Can this please be HAVE_AS_NOPS_DIRECTIVE? > --- a/xen/arch/x86/alternative.c > +++ b/xen/arch/x86/alternative.c > @@ -84,6 +84,19 @@ static const unsigned char * const p6_nops[ASM_NOP_MAX+1] > init_or_livepatch_cons > > static const unsigned char * const *ideal_nops init_or_livepatch_data = > p6_nops; > > +#ifdef HAVE_AS_NOP_DIRECTIVE > + > +/* Nops in .init.rodata to compare against the runtime ideal nops. */ > +asm ( ".pushsection .init.rodata, \"a\", @progbits\n\t" > + "toolchain_nops: .nops " __stringify(ASM_NOP_MAX) "\n\t" > + ".popsection\n\t"); Any particular reason not to put them in .init.text? > @@ -27,6 +26,14 @@ extern void add_nops(void *insns, unsigned int len); > extern void apply_alternatives(struct alt_instr *start, struct alt_instr > *end); > extern void alternative_instructions(void); > > +asm ( ".macro mknops nr_bytes\n\t" > +#ifdef HAVE_AS_NOP_DIRECTIVE > + ".nops \\nr_bytes, " __stringify(ASM_NOP_MAX) "\n\t" > +#else > + ".skip \\nr_bytes, 0x90\n\t" > +#endif > + ".endm\n\t" ); Did you take a look at "x86/alternatives: allow using assembler macros in favor of C ones" yet? Perhaps this redundant macro definition could be avoided that way? I'm not going to exclude though that some more work might be needed to get there. Anyway, I'm in principle fine with the change, irrespective of the other discussion we've had, so with the name change and possibly the other two remarks taken care of Acked-by: Jan Beulich Jan ___ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH] x86/build: Use new .nops directive when available
On Thu, Aug 16, 2018 at 04:56:04PM +0100, Andrew Cooper wrote: > On 16/08/18 15:31, Roger Pau Monné wrote: > > On Thu, Aug 16, 2018 at 11:42:56AM +0100, Andrew Cooper wrote: > >> On 16/08/18 10:55, Roger Pau Monné wrote: > >>> On Wed, Aug 15, 2018 at 06:57:38PM +0100, Andrew Cooper wrote: > diff --git a/xen/arch/x86/Rules.mk b/xen/arch/x86/Rules.mk > index ac585a3..c84ed20 100644 > --- a/xen/arch/x86/Rules.mk > +++ b/xen/arch/x86/Rules.mk > @@ -29,6 +29,10 @@ $(call as-option-add,CFLAGS,CC,"invpcid > (%rax)$$(comma)%rax",-DHAVE_AS_INVPCID) > $(call as-option-add,CFLAGS,CC,\ > ".if ((1 > 0) < 0); .error \"\";.endif",,-DHAVE_AS_NEGATIVE_TRUE) > > +# Check to see whether the assmbler supports the .nop directive. > +$(call as-option-add,CFLAGS,CC,\ > +".L1: .L2: .nops (.L2 - .L1)$$(comma)9",-DHAVE_AS_NOP_DIRECTIVE) > >>> I think I remember commenting on an earlier version of this about the > >>> usage of the CONTROL parameter. I would expect the assembler to > >>> use the most optimized version by default, is that not the case? > >> Again, I don't understand what you're trying to say. > >> > >> This expression is like this, because that's how we actually use it. > > As mentioned in another email, I was wondering why we choose to not > > use nop instructions > 9 bytes. The assembler will by default use > > nop instructions up to 11 bytes for 64bit code. > > Using more than 9 bytes is suboptimal on some hardware. OK. But using the same argument isn't it also suboptimal to use 9 bytes on some hardware then also? What I mean by this is that it would be good to add a comment somewhere that notes why nop instructions are limited to 9 bytes, because that's likely to generate the more optimized code on a wide variety of hardware. At least it would have helped me. > Toolchains use long nops (exclusively?) for basic block alignment, > whereas we use use them for executing through because its still faster > than a branch. > > > > + > CFLAGS += -mno-red-zone -fpic -fno-asynchronous-unwind-tables > > # Xen doesn't use SSE interally. If the compiler supports it, also > skip the > diff --git a/xen/arch/x86/alternative.c b/xen/arch/x86/alternative.c > index 0ef7a8b..2c844d6 100644 > --- a/xen/arch/x86/alternative.c > +++ b/xen/arch/x86/alternative.c > @@ -84,6 +84,19 @@ static const unsigned char * const > p6_nops[ASM_NOP_MAX+1] init_or_livepatch_cons > > static const unsigned char * const *ideal_nops init_or_livepatch_data = > p6_nops; > > +#ifdef HAVE_AS_NOP_DIRECTIVE > + > +/* Nops in .init.rodata to compare against the runtime ideal nops. */ > +asm ( ".pushsection .init.rodata, \"a\", @progbits\n\t" > + "toolchain_nops: .nops " __stringify(ASM_NOP_MAX) "\n\t" > + ".popsection\n\t"); > +extern char toolchain_nops[ASM_NOP_MAX]; > +static bool __read_mostly toolchain_nops_are_ideal; > + > +#else > +# define toolchain_nops_are_ideal false > +#endif > + > static void __init arch_init_ideal_nops(void) > { > switch ( boot_cpu_data.x86_vendor ) > @@ -112,6 +125,11 @@ static void __init arch_init_ideal_nops(void) > ideal_nops = k8_nops; > break; > } > + > +#ifdef HAVE_AS_NOP_DIRECTIVE > +if ( memcmp(ideal_nops[ASM_NOP_MAX], toolchain_nops, ASM_NOP_MAX) > == 0 ) > +toolchain_nops_are_ideal = true; > +#endif > >>> You are only comparing that the biggest nop instruction (9 bytes > >>> AFAICT) generated by the assembler is what Xen believes to be the more > >>> optimized version. What about shorter nops? > >> They are all variations on a theme. > >> > >> For P6 nops, its the 0f 1f root which is important, which takes a modrm > >> byte. Traditionally, its always encoded with eax and uses redundant > >> memory encodings for longer instructions. > >> > >> I can't think of any way of detecting if the optimised nops if the > >> toolchain starts using alternative registers in the encoding, but I > >> expect this case won't happen in practice. > > I would rather do: > > > > toolchain_nops: > > .nops 1 > > .nops 2 > > .nops 3 > > ... > > .nops 9 > > > > And then build an assembler_nops[ASM_NOP_MAX+1] like it's done for the > > other nops. Then you could do: > > > > toolchain_nops_are_ideal = true; > > for ( i = 1; i < ASM_NOP_MAX+1; i++ ) > > if ( memcmp(ideal_nops[i], assembler_nops[i], i) ) > > { > > toolchain_nops_are_ideal = false; > > break; > > } > > > > In order to make sure all the possible nop sizes are using the > > expected optimized version. > > What is the point? Other than the 0f 1f, it really doesn't matter, and > small variations won't invalidate them as ideal nops. > > This check needs to be just good enough to tell
Re: [Xen-devel] [PATCH] x86/build: Use new .nops directive when available
On 16/08/18 15:31, Roger Pau Monné wrote: > On Thu, Aug 16, 2018 at 11:42:56AM +0100, Andrew Cooper wrote: >> On 16/08/18 10:55, Roger Pau Monné wrote: >>> On Wed, Aug 15, 2018 at 06:57:38PM +0100, Andrew Cooper wrote: diff --git a/xen/arch/x86/Rules.mk b/xen/arch/x86/Rules.mk index ac585a3..c84ed20 100644 --- a/xen/arch/x86/Rules.mk +++ b/xen/arch/x86/Rules.mk @@ -29,6 +29,10 @@ $(call as-option-add,CFLAGS,CC,"invpcid (%rax)$$(comma)%rax",-DHAVE_AS_INVPCID) $(call as-option-add,CFLAGS,CC,\ ".if ((1 > 0) < 0); .error \"\";.endif",,-DHAVE_AS_NEGATIVE_TRUE) +# Check to see whether the assmbler supports the .nop directive. +$(call as-option-add,CFLAGS,CC,\ +".L1: .L2: .nops (.L2 - .L1)$$(comma)9",-DHAVE_AS_NOP_DIRECTIVE) >>> I think I remember commenting on an earlier version of this about the >>> usage of the CONTROL parameter. I would expect the assembler to >>> use the most optimized version by default, is that not the case? >> Again, I don't understand what you're trying to say. >> >> This expression is like this, because that's how we actually use it. > As mentioned in another email, I was wondering why we choose to not > use nop instructions > 9 bytes. The assembler will by default use > nop instructions up to 11 bytes for 64bit code. Using more than 9 bytes is suboptimal on some hardware. Toolchains use long nops (exclusively?) for basic block alignment, whereas we use use them for executing through because its still faster than a branch. > + CFLAGS += -mno-red-zone -fpic -fno-asynchronous-unwind-tables # Xen doesn't use SSE interally. If the compiler supports it, also skip the diff --git a/xen/arch/x86/alternative.c b/xen/arch/x86/alternative.c index 0ef7a8b..2c844d6 100644 --- a/xen/arch/x86/alternative.c +++ b/xen/arch/x86/alternative.c @@ -84,6 +84,19 @@ static const unsigned char * const p6_nops[ASM_NOP_MAX+1] init_or_livepatch_cons static const unsigned char * const *ideal_nops init_or_livepatch_data = p6_nops; +#ifdef HAVE_AS_NOP_DIRECTIVE + +/* Nops in .init.rodata to compare against the runtime ideal nops. */ +asm ( ".pushsection .init.rodata, \"a\", @progbits\n\t" + "toolchain_nops: .nops " __stringify(ASM_NOP_MAX) "\n\t" + ".popsection\n\t"); +extern char toolchain_nops[ASM_NOP_MAX]; +static bool __read_mostly toolchain_nops_are_ideal; + +#else +# define toolchain_nops_are_ideal false +#endif + static void __init arch_init_ideal_nops(void) { switch ( boot_cpu_data.x86_vendor ) @@ -112,6 +125,11 @@ static void __init arch_init_ideal_nops(void) ideal_nops = k8_nops; break; } + +#ifdef HAVE_AS_NOP_DIRECTIVE +if ( memcmp(ideal_nops[ASM_NOP_MAX], toolchain_nops, ASM_NOP_MAX) == 0 ) +toolchain_nops_are_ideal = true; +#endif >>> You are only comparing that the biggest nop instruction (9 bytes >>> AFAICT) generated by the assembler is what Xen believes to be the more >>> optimized version. What about shorter nops? >> They are all variations on a theme. >> >> For P6 nops, its the 0f 1f root which is important, which takes a modrm >> byte. Traditionally, its always encoded with eax and uses redundant >> memory encodings for longer instructions. >> >> I can't think of any way of detecting if the optimised nops if the >> toolchain starts using alternative registers in the encoding, but I >> expect this case won't happen in practice. > I would rather do: > > toolchain_nops: > .nops 1 > .nops 2 > .nops 3 > ... > .nops 9 > > And then build an assembler_nops[ASM_NOP_MAX+1] like it's done for the > other nops. Then you could do: > > toolchain_nops_are_ideal = true; > for ( i = 1; i < ASM_NOP_MAX+1; i++ ) > if ( memcmp(ideal_nops[i], assembler_nops[i], i) ) > { > toolchain_nops_are_ideal = false; > break; > } > > In order to make sure all the possible nop sizes are using the > expected optimized version. What is the point? Other than the 0f 1f, it really doesn't matter, and small variations won't invalidate them as ideal nops. This check needs to be just good enough to tell whether the toolchain used P6 or K8 nops, and unless you explicitly built with -march=k8, the answer is going to be P6. It does not need to check every variation byte for byte. Going back to my original argument for not even doing this basic check, if we happen to be wrong and the toolchain wrote the other kind of long nops, you probably won't be able to measure the difference. ~Andrew ___ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH] x86/build: Use new .nops directive when available
On Thu, Aug 16, 2018 at 11:42:56AM +0100, Andrew Cooper wrote: > On 16/08/18 10:55, Roger Pau Monné wrote: > > On Wed, Aug 15, 2018 at 06:57:38PM +0100, Andrew Cooper wrote: > >> diff --git a/xen/arch/x86/Rules.mk b/xen/arch/x86/Rules.mk > >> index ac585a3..c84ed20 100644 > >> --- a/xen/arch/x86/Rules.mk > >> +++ b/xen/arch/x86/Rules.mk > >> @@ -29,6 +29,10 @@ $(call as-option-add,CFLAGS,CC,"invpcid > >> (%rax)$$(comma)%rax",-DHAVE_AS_INVPCID) > >> $(call as-option-add,CFLAGS,CC,\ > >> ".if ((1 > 0) < 0); .error \"\";.endif",,-DHAVE_AS_NEGATIVE_TRUE) > >> > >> +# Check to see whether the assmbler supports the .nop directive. > >> +$(call as-option-add,CFLAGS,CC,\ > >> +".L1: .L2: .nops (.L2 - .L1)$$(comma)9",-DHAVE_AS_NOP_DIRECTIVE) > > I think I remember commenting on an earlier version of this about the > > usage of the CONTROL parameter. I would expect the assembler to > > use the most optimized version by default, is that not the case? > > Again, I don't understand what you're trying to say. > > This expression is like this, because that's how we actually use it. As mentioned in another email, I was wondering why we choose to not use nop instructions > 9 bytes. The assembler will by default use nop instructions up to 11 bytes for 64bit code. > > > >> + > >> CFLAGS += -mno-red-zone -fpic -fno-asynchronous-unwind-tables > >> > >> # Xen doesn't use SSE interally. If the compiler supports it, also skip > >> the > >> diff --git a/xen/arch/x86/alternative.c b/xen/arch/x86/alternative.c > >> index 0ef7a8b..2c844d6 100644 > >> --- a/xen/arch/x86/alternative.c > >> +++ b/xen/arch/x86/alternative.c > >> @@ -84,6 +84,19 @@ static const unsigned char * const > >> p6_nops[ASM_NOP_MAX+1] init_or_livepatch_cons > >> > >> static const unsigned char * const *ideal_nops init_or_livepatch_data = > >> p6_nops; > >> > >> +#ifdef HAVE_AS_NOP_DIRECTIVE > >> + > >> +/* Nops in .init.rodata to compare against the runtime ideal nops. */ > >> +asm ( ".pushsection .init.rodata, \"a\", @progbits\n\t" > >> + "toolchain_nops: .nops " __stringify(ASM_NOP_MAX) "\n\t" > >> + ".popsection\n\t"); > >> +extern char toolchain_nops[ASM_NOP_MAX]; > >> +static bool __read_mostly toolchain_nops_are_ideal; > >> + > >> +#else > >> +# define toolchain_nops_are_ideal false > >> +#endif > >> + > >> static void __init arch_init_ideal_nops(void) > >> { > >> switch ( boot_cpu_data.x86_vendor ) > >> @@ -112,6 +125,11 @@ static void __init arch_init_ideal_nops(void) > >> ideal_nops = k8_nops; > >> break; > >> } > >> + > >> +#ifdef HAVE_AS_NOP_DIRECTIVE > >> +if ( memcmp(ideal_nops[ASM_NOP_MAX], toolchain_nops, ASM_NOP_MAX) == > >> 0 ) > >> +toolchain_nops_are_ideal = true; > >> +#endif > > You are only comparing that the biggest nop instruction (9 bytes > > AFAICT) generated by the assembler is what Xen believes to be the more > > optimized version. What about shorter nops? > > They are all variations on a theme. > > For P6 nops, its the 0f 1f root which is important, which takes a modrm > byte. Traditionally, its always encoded with eax and uses redundant > memory encodings for longer instructions. > > I can't think of any way of detecting if the optimised nops if the > toolchain starts using alternative registers in the encoding, but I > expect this case won't happen in practice. I would rather do: toolchain_nops: .nops 1 .nops 2 .nops 3 ... .nops 9 And then build an assembler_nops[ASM_NOP_MAX+1] like it's done for the other nops. Then you could do: toolchain_nops_are_ideal = true; for ( i = 1; i < ASM_NOP_MAX+1; i++ ) if ( memcmp(ideal_nops[i], assembler_nops[i], i) ) { toolchain_nops_are_ideal = false; break; } In order to make sure all the possible nop sizes are using the expected optimized version. Roger. ___ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH] x86/build: Use new .nops directive when available
>>> On 16.08.18 at 13:48, wrote: > On 16/08/18 12:34, Jan Beulich wrote: > On 16.08.18 at 12:42, wrote: >>> On 16/08/18 10:55, Roger Pau Monné wrote: On Wed, Aug 15, 2018 at 06:57:38PM +0100, Andrew Cooper wrote: > @@ -112,6 +125,11 @@ static void __init arch_init_ideal_nops(void) > ideal_nops = k8_nops; > break; > } > + > +#ifdef HAVE_AS_NOP_DIRECTIVE > +if ( memcmp(ideal_nops[ASM_NOP_MAX], toolchain_nops, ASM_NOP_MAX) == > 0 > ) > +toolchain_nops_are_ideal = true; > +#endif You are only comparing that the biggest nop instruction (9 bytes AFAICT) generated by the assembler is what Xen believes to be the more optimized version. What about shorter nops? >>> They are all variations on a theme. >>> >>> For P6 nops, its the 0f 1f root which is important, which takes a modrm >>> byte. Traditionally, its always encoded with eax and uses redundant >>> memory encodings for longer instructions. >>> >>> I can't think of any way of detecting if the optimised nops if the >>> toolchain starts using alternative registers in the encoding, but I >>> expect this case won't happen in practice. >> It's not just the register encoding, but also the maximum single-insn >> length that gets generated. Recall that until not very long ago we >> had up to 8-byte NOP insns only? The view on the mod (as in ModRM) >> usage may vary over time, as may the view on which or how many >> prefixes are reasonable to have. > > Strictly speaking, the ORM says "encode the least-recently live > register", because all the hint nops are still subject to reg/reg > dependencies. > > However, we definitely can't take advantage of this, nor can the > assembler. Well, _we_ could, at least when tail padding patched in insns: I very much hope we know what we've patched in, and hence at least what registers were used most recently. This is not readily available today, but could be made so. > Compilers can't either, because the exact length of the nop > depends on other relocations. Furthermore, the perf improvement from > doing this would be fractional. True. Jan ___ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH] x86/build: Use new .nops directive when available
>>> On 16.08.18 at 13:57, wrote: > On Thu, Aug 16, 2018 at 04:18:03AM -0600, Jan Beulich wrote: >> >>> On 16.08.18 at 11:55, wrote: >> > On Wed, Aug 15, 2018 at 06:57:38PM +0100, Andrew Cooper wrote: >> >> --- a/xen/arch/x86/Rules.mk >> >> +++ b/xen/arch/x86/Rules.mk >> >> @@ -29,6 +29,10 @@ $(call as-option-add,CFLAGS,CC,"invpcid > (%rax)$$(comma)%rax",-DHAVE_AS_INVPCID) >> >> $(call as-option-add,CFLAGS,CC,\ >> >> ".if ((1 > 0) < 0); .error \"\";.endif",,-DHAVE_AS_NEGATIVE_TRUE) >> >> >> >> +# Check to see whether the assmbler supports the .nop directive. >> >> +$(call as-option-add,CFLAGS,CC,\ >> >> +".L1: .L2: .nops (.L2 - .L1)$$(comma)9",-DHAVE_AS_NOP_DIRECTIVE) >> > >> > I think I remember commenting on an earlier version of this about the >> > usage of the CONTROL parameter. I would expect the assembler to >> > use the most optimized version by default, is that not the case? >> >> How could it, without knowing what the target hardware is? And even >> if it could, what is considered "most optimized" tends to change every >> once in a while (or else we wouldn't have different NOP flavors to >> begin with). > > I think I haven't explained myself well. By using the CONTROL > parameter we limit the max length of a nop instruction to 9 bytes > instead of using the maximum supported by the assembler (11 bytes IIRC > for 64bit). Is this done because there are issues with using nops > 9 > bytes? There the problems start that I've hinted at towards Andrew: gas supports up to 11-byte NOPs despite the SDM saying otherwise at present. But ORM and SDM disagree as well, for the 3- and 6-byte variants. Otoh AMD recommends up to 11-byte NOPs for Fam15 and earlier, and suggests even using up to 15-byte ones on Fam17. Besides that I also wonder whether in 64-bit mode REX prefixes wouldn't be preferable over operand size override ones. Jan ___ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH] x86/build: Use new .nops directive when available
On Thu, Aug 16, 2018 at 04:18:03AM -0600, Jan Beulich wrote: > >>> On 16.08.18 at 11:55, wrote: > > On Wed, Aug 15, 2018 at 06:57:38PM +0100, Andrew Cooper wrote: > >> --- a/xen/arch/x86/Rules.mk > >> +++ b/xen/arch/x86/Rules.mk > >> @@ -29,6 +29,10 @@ $(call as-option-add,CFLAGS,CC,"invpcid > >> (%rax)$$(comma)%rax",-DHAVE_AS_INVPCID) > >> $(call as-option-add,CFLAGS,CC,\ > >> ".if ((1 > 0) < 0); .error \"\";.endif",,-DHAVE_AS_NEGATIVE_TRUE) > >> > >> +# Check to see whether the assmbler supports the .nop directive. > >> +$(call as-option-add,CFLAGS,CC,\ > >> +".L1: .L2: .nops (.L2 - .L1)$$(comma)9",-DHAVE_AS_NOP_DIRECTIVE) > > > > I think I remember commenting on an earlier version of this about the > > usage of the CONTROL parameter. I would expect the assembler to > > use the most optimized version by default, is that not the case? > > How could it, without knowing what the target hardware is? And even > if it could, what is considered "most optimized" tends to change every > once in a while (or else we wouldn't have different NOP flavors to > begin with). I think I haven't explained myself well. By using the CONTROL parameter we limit the max length of a nop instruction to 9 bytes instead of using the maximum supported by the assembler (11 bytes IIRC for 64bit). Is this done because there are issues with using nops > 9 bytes? Roger. ___ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH] x86/build: Use new .nops directive when available
On 16/08/18 12:34, Jan Beulich wrote: On 16.08.18 at 12:42, wrote: >> On 16/08/18 10:55, Roger Pau Monné wrote: >>> On Wed, Aug 15, 2018 at 06:57:38PM +0100, Andrew Cooper wrote: @@ -112,6 +125,11 @@ static void __init arch_init_ideal_nops(void) ideal_nops = k8_nops; break; } + +#ifdef HAVE_AS_NOP_DIRECTIVE +if ( memcmp(ideal_nops[ASM_NOP_MAX], toolchain_nops, ASM_NOP_MAX) == 0 ) +toolchain_nops_are_ideal = true; +#endif >>> You are only comparing that the biggest nop instruction (9 bytes >>> AFAICT) generated by the assembler is what Xen believes to be the more >>> optimized version. What about shorter nops? >> They are all variations on a theme. >> >> For P6 nops, its the 0f 1f root which is important, which takes a modrm >> byte. Traditionally, its always encoded with eax and uses redundant >> memory encodings for longer instructions. >> >> I can't think of any way of detecting if the optimised nops if the >> toolchain starts using alternative registers in the encoding, but I >> expect this case won't happen in practice. > It's not just the register encoding, but also the maximum single-insn > length that gets generated. Recall that until not very long ago we > had up to 8-byte NOP insns only? The view on the mod (as in ModRM) > usage may vary over time, as may the view on which or how many > prefixes are reasonable to have. Strictly speaking, the ORM says "encode the least-recently live register", because all the hint nops are still subject to reg/reg dependencies. However, we definitely can't take advantage of this, nor can the assembler. Compilers can't either, because the exact length of the nop depends on other relocations. Furthermore, the perf improvement from doing this would be fractional. IOW, I don't expect we'll realistically see encodings other than the exact byte layout described in the ORM. ~Andrew ___ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH] x86/build: Use new .nops directive when available
>>> On 16.08.18 at 12:42, wrote: > On 16/08/18 10:55, Roger Pau Monné wrote: >> On Wed, Aug 15, 2018 at 06:57:38PM +0100, Andrew Cooper wrote: >>> @@ -112,6 +125,11 @@ static void __init arch_init_ideal_nops(void) >>> ideal_nops = k8_nops; >>> break; >>> } >>> + >>> +#ifdef HAVE_AS_NOP_DIRECTIVE >>> +if ( memcmp(ideal_nops[ASM_NOP_MAX], toolchain_nops, ASM_NOP_MAX) == 0 >>> ) >>> +toolchain_nops_are_ideal = true; >>> +#endif >> You are only comparing that the biggest nop instruction (9 bytes >> AFAICT) generated by the assembler is what Xen believes to be the more >> optimized version. What about shorter nops? > > They are all variations on a theme. > > For P6 nops, its the 0f 1f root which is important, which takes a modrm > byte. Traditionally, its always encoded with eax and uses redundant > memory encodings for longer instructions. > > I can't think of any way of detecting if the optimised nops if the > toolchain starts using alternative registers in the encoding, but I > expect this case won't happen in practice. It's not just the register encoding, but also the maximum single-insn length that gets generated. Recall that until not very long ago we had up to 8-byte NOP insns only? The view on the mod (as in ModRM) usage may vary over time, as may the view on which or how many prefixes are reasonable to have. Jan ___ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH] x86/build: Use new .nops directive when available
On 16/08/18 10:55, Roger Pau Monné wrote: > On Wed, Aug 15, 2018 at 06:57:38PM +0100, Andrew Cooper wrote: >> Newer versions of binutils are capable of emitting an exact number bytes >> worth >> of optimised nops, which are P6 nops. Use this in preference to .skip when >> available. >> >> Check at boot time whether the toolchain nops are the correct for the running >> hardware, andskip optimising nops entirely when possible. >^ missing space. > > TBH I'm not sure I see the benefit of using .nops over using .skip. In this case, or in general? In general, so we don't need to self/cross modify the alternatives points which aren't patched. In this case, because it is the .nops directive we're using to insert nops. > Xen needs to do a memcmp in order to check whether the resulting nops > are what Xen considers the more optimized instructions for the CPU > currently running on. Xen can avoid the memcpy by using skip, because > in that case Xen knows exactly the current instructions and there's no > need to memcmp. I'm afraid I don't understand what point you are attempting to make here. > I guess the reason is that the memcmp will be done only once, and > hopefully in most cases the assembler generated nops will be the most > optimized version. The memcmp() is once during init, and you've got to be on very ancient hardware for the toolchain nops to not be the correct ones. I'm going to conservatively estimate that 98% of hardware running Xen will have P6 nops as ideal. >> Signed-off-by: Andrew Cooper >> --- >> CC: Jan Beulich >> CC: Konrad Rzeszutek Wilk >> CC: Roger Pau Monné >> CC: Wei Liu >> --- >> xen/arch/x86/Rules.mk | 4 >> xen/arch/x86/alternative.c| 20 +++- >> xen/include/asm-x86/alternative-asm.h | 12 +++- >> xen/include/asm-x86/alternative.h | 11 +-- >> 4 files changed, 43 insertions(+), 4 deletions(-) >> >> diff --git a/xen/arch/x86/Rules.mk b/xen/arch/x86/Rules.mk >> index ac585a3..c84ed20 100644 >> --- a/xen/arch/x86/Rules.mk >> +++ b/xen/arch/x86/Rules.mk >> @@ -29,6 +29,10 @@ $(call as-option-add,CFLAGS,CC,"invpcid >> (%rax)$$(comma)%rax",-DHAVE_AS_INVPCID) >> $(call as-option-add,CFLAGS,CC,\ >> ".if ((1 > 0) < 0); .error \"\";.endif",,-DHAVE_AS_NEGATIVE_TRUE) >> >> +# Check to see whether the assmbler supports the .nop directive. >> +$(call as-option-add,CFLAGS,CC,\ >> +".L1: .L2: .nops (.L2 - .L1)$$(comma)9",-DHAVE_AS_NOP_DIRECTIVE) > I think I remember commenting on an earlier version of this about the > usage of the CONTROL parameter. I would expect the assembler to > use the most optimized version by default, is that not the case? Again, I don't understand what you're trying to say. This expression is like this, because that's how we actually use it. > >> + >> CFLAGS += -mno-red-zone -fpic -fno-asynchronous-unwind-tables >> >> # Xen doesn't use SSE interally. If the compiler supports it, also skip the >> diff --git a/xen/arch/x86/alternative.c b/xen/arch/x86/alternative.c >> index 0ef7a8b..2c844d6 100644 >> --- a/xen/arch/x86/alternative.c >> +++ b/xen/arch/x86/alternative.c >> @@ -84,6 +84,19 @@ static const unsigned char * const p6_nops[ASM_NOP_MAX+1] >> init_or_livepatch_cons >> >> static const unsigned char * const *ideal_nops init_or_livepatch_data = >> p6_nops; >> >> +#ifdef HAVE_AS_NOP_DIRECTIVE >> + >> +/* Nops in .init.rodata to compare against the runtime ideal nops. */ >> +asm ( ".pushsection .init.rodata, \"a\", @progbits\n\t" >> + "toolchain_nops: .nops " __stringify(ASM_NOP_MAX) "\n\t" >> + ".popsection\n\t"); >> +extern char toolchain_nops[ASM_NOP_MAX]; >> +static bool __read_mostly toolchain_nops_are_ideal; >> + >> +#else >> +# define toolchain_nops_are_ideal false >> +#endif >> + >> static void __init arch_init_ideal_nops(void) >> { >> switch ( boot_cpu_data.x86_vendor ) >> @@ -112,6 +125,11 @@ static void __init arch_init_ideal_nops(void) >> ideal_nops = k8_nops; >> break; >> } >> + >> +#ifdef HAVE_AS_NOP_DIRECTIVE >> +if ( memcmp(ideal_nops[ASM_NOP_MAX], toolchain_nops, ASM_NOP_MAX) == 0 ) >> +toolchain_nops_are_ideal = true; >> +#endif > You are only comparing that the biggest nop instruction (9 bytes > AFAICT) generated by the assembler is what Xen believes to be the more > optimized version. What about shorter nops? They are all variations on a theme. For P6 nops, its the 0f 1f root which is important, which takes a modrm byte. Traditionally, its always encoded with eax and uses redundant memory encodings for longer instructions. I can't think of any way of detecting if the optimised nops if the toolchain starts using alternative registers in the encoding, but I expect this case won't happen in practice. > I also see a chance that maybe newer assembler versions will at some > point generate more optimized nops, but Xen will replace them with not > so optimized versions if the
Re: [Xen-devel] [PATCH] x86/build: Use new .nops directive when available
>>> On 16.08.18 at 11:55, wrote: > On Wed, Aug 15, 2018 at 06:57:38PM +0100, Andrew Cooper wrote: >> --- a/xen/arch/x86/Rules.mk >> +++ b/xen/arch/x86/Rules.mk >> @@ -29,6 +29,10 @@ $(call as-option-add,CFLAGS,CC,"invpcid >> (%rax)$$(comma)%rax",-DHAVE_AS_INVPCID) >> $(call as-option-add,CFLAGS,CC,\ >> ".if ((1 > 0) < 0); .error \"\";.endif",,-DHAVE_AS_NEGATIVE_TRUE) >> >> +# Check to see whether the assmbler supports the .nop directive. >> +$(call as-option-add,CFLAGS,CC,\ >> +".L1: .L2: .nops (.L2 - .L1)$$(comma)9",-DHAVE_AS_NOP_DIRECTIVE) > > I think I remember commenting on an earlier version of this about the > usage of the CONTROL parameter. I would expect the assembler to > use the most optimized version by default, is that not the case? How could it, without knowing what the target hardware is? And even if it could, what is considered "most optimized" tends to change every once in a while (or else we wouldn't have different NOP flavors to begin with). Jan ___ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH] x86/build: Use new .nops directive when available
On Wed, Aug 15, 2018 at 06:57:38PM +0100, Andrew Cooper wrote: > Newer versions of binutils are capable of emitting an exact number bytes worth > of optimised nops, which are P6 nops. Use this in preference to .skip when > available. > > Check at boot time whether the toolchain nops are the correct for the running > hardware, andskip optimising nops entirely when possible. ^ missing space. TBH I'm not sure I see the benefit of using .nops over using .skip. Xen needs to do a memcmp in order to check whether the resulting nops are what Xen considers the more optimized instructions for the CPU currently running on. Xen can avoid the memcpy by using skip, because in that case Xen knows exactly the current instructions and there's no need to memcmp. I guess the reason is that the memcmp will be done only once, and hopefully in most cases the assembler generated nops will be the most optimized version. > Signed-off-by: Andrew Cooper > --- > CC: Jan Beulich > CC: Konrad Rzeszutek Wilk > CC: Roger Pau Monné > CC: Wei Liu > --- > xen/arch/x86/Rules.mk | 4 > xen/arch/x86/alternative.c| 20 +++- > xen/include/asm-x86/alternative-asm.h | 12 +++- > xen/include/asm-x86/alternative.h | 11 +-- > 4 files changed, 43 insertions(+), 4 deletions(-) > > diff --git a/xen/arch/x86/Rules.mk b/xen/arch/x86/Rules.mk > index ac585a3..c84ed20 100644 > --- a/xen/arch/x86/Rules.mk > +++ b/xen/arch/x86/Rules.mk > @@ -29,6 +29,10 @@ $(call as-option-add,CFLAGS,CC,"invpcid > (%rax)$$(comma)%rax",-DHAVE_AS_INVPCID) > $(call as-option-add,CFLAGS,CC,\ > ".if ((1 > 0) < 0); .error \"\";.endif",,-DHAVE_AS_NEGATIVE_TRUE) > > +# Check to see whether the assmbler supports the .nop directive. > +$(call as-option-add,CFLAGS,CC,\ > +".L1: .L2: .nops (.L2 - .L1)$$(comma)9",-DHAVE_AS_NOP_DIRECTIVE) I think I remember commenting on an earlier version of this about the usage of the CONTROL parameter. I would expect the assembler to use the most optimized version by default, is that not the case? > + > CFLAGS += -mno-red-zone -fpic -fno-asynchronous-unwind-tables > > # Xen doesn't use SSE interally. If the compiler supports it, also skip the > diff --git a/xen/arch/x86/alternative.c b/xen/arch/x86/alternative.c > index 0ef7a8b..2c844d6 100644 > --- a/xen/arch/x86/alternative.c > +++ b/xen/arch/x86/alternative.c > @@ -84,6 +84,19 @@ static const unsigned char * const p6_nops[ASM_NOP_MAX+1] > init_or_livepatch_cons > > static const unsigned char * const *ideal_nops init_or_livepatch_data = > p6_nops; > > +#ifdef HAVE_AS_NOP_DIRECTIVE > + > +/* Nops in .init.rodata to compare against the runtime ideal nops. */ > +asm ( ".pushsection .init.rodata, \"a\", @progbits\n\t" > + "toolchain_nops: .nops " __stringify(ASM_NOP_MAX) "\n\t" > + ".popsection\n\t"); > +extern char toolchain_nops[ASM_NOP_MAX]; > +static bool __read_mostly toolchain_nops_are_ideal; > + > +#else > +# define toolchain_nops_are_ideal false > +#endif > + > static void __init arch_init_ideal_nops(void) > { > switch ( boot_cpu_data.x86_vendor ) > @@ -112,6 +125,11 @@ static void __init arch_init_ideal_nops(void) > ideal_nops = k8_nops; > break; > } > + > +#ifdef HAVE_AS_NOP_DIRECTIVE > +if ( memcmp(ideal_nops[ASM_NOP_MAX], toolchain_nops, ASM_NOP_MAX) == 0 ) > +toolchain_nops_are_ideal = true; > +#endif You are only comparing that the biggest nop instruction (9 bytes AFAICT) generated by the assembler is what Xen believes to be the more optimized version. What about shorter nops? I also see a chance that maybe newer assembler versions will at some point generate more optimized nops, but Xen will replace them with not so optimized versions if the Xen logic is not so up to date. > } > > /* Use this to add nops to a buffer, then text_poke the whole buffer. */ > @@ -209,7 +227,7 @@ void init_or_livepatch apply_alternatives(struct > alt_instr *start, > base->priv = 1; > > /* Nothing useful to do? */ > -if ( a->pad_len <= 1 ) > +if ( toolchain_nops_are_ideal || a->pad_len <= 1 ) > continue; > > add_nops(buf, a->pad_len); > diff --git a/xen/include/asm-x86/alternative-asm.h > b/xen/include/asm-x86/alternative-asm.h > index 0b61516..0d6fb4b 100644 > --- a/xen/include/asm-x86/alternative-asm.h > +++ b/xen/include/asm-x86/alternative-asm.h > @@ -1,6 +1,8 @@ > #ifndef _ASM_X86_ALTERNATIVE_ASM_H_ > #define _ASM_X86_ALTERNATIVE_ASM_H_ > > +#include > + > #ifdef __ASSEMBLY__ > > /* > @@ -19,6 +21,14 @@ > .byte 0 /* priv */ > .endm > > +.macro mknops nr_bytes > +#ifdef HAVE_AS_NOP_DIRECTIVE > +.nops \nr_bytes, ASM_NOP_MAX > +#else > +.skip \nr_bytes, 0x90 Use P6_NOP1 instead of open coding 0x90? Or have a #define DEFAULT_NOP P6_NOP1 And use it instead. Thanks, Roger.
[Xen-devel] [PATCH] x86/build: Use new .nops directive when available
Newer versions of binutils are capable of emitting an exact number bytes worth of optimised nops, which are P6 nops. Use this in preference to .skip when available. Check at boot time whether the toolchain nops are the correct for the running hardware, andskip optimising nops entirely when possible. Signed-off-by: Andrew Cooper --- CC: Jan Beulich CC: Konrad Rzeszutek Wilk CC: Roger Pau Monné CC: Wei Liu --- xen/arch/x86/Rules.mk | 4 xen/arch/x86/alternative.c| 20 +++- xen/include/asm-x86/alternative-asm.h | 12 +++- xen/include/asm-x86/alternative.h | 11 +-- 4 files changed, 43 insertions(+), 4 deletions(-) diff --git a/xen/arch/x86/Rules.mk b/xen/arch/x86/Rules.mk index ac585a3..c84ed20 100644 --- a/xen/arch/x86/Rules.mk +++ b/xen/arch/x86/Rules.mk @@ -29,6 +29,10 @@ $(call as-option-add,CFLAGS,CC,"invpcid (%rax)$$(comma)%rax",-DHAVE_AS_INVPCID) $(call as-option-add,CFLAGS,CC,\ ".if ((1 > 0) < 0); .error \"\";.endif",,-DHAVE_AS_NEGATIVE_TRUE) +# Check to see whether the assmbler supports the .nop directive. +$(call as-option-add,CFLAGS,CC,\ +".L1: .L2: .nops (.L2 - .L1)$$(comma)9",-DHAVE_AS_NOP_DIRECTIVE) + CFLAGS += -mno-red-zone -fpic -fno-asynchronous-unwind-tables # Xen doesn't use SSE interally. If the compiler supports it, also skip the diff --git a/xen/arch/x86/alternative.c b/xen/arch/x86/alternative.c index 0ef7a8b..2c844d6 100644 --- a/xen/arch/x86/alternative.c +++ b/xen/arch/x86/alternative.c @@ -84,6 +84,19 @@ static const unsigned char * const p6_nops[ASM_NOP_MAX+1] init_or_livepatch_cons static const unsigned char * const *ideal_nops init_or_livepatch_data = p6_nops; +#ifdef HAVE_AS_NOP_DIRECTIVE + +/* Nops in .init.rodata to compare against the runtime ideal nops. */ +asm ( ".pushsection .init.rodata, \"a\", @progbits\n\t" + "toolchain_nops: .nops " __stringify(ASM_NOP_MAX) "\n\t" + ".popsection\n\t"); +extern char toolchain_nops[ASM_NOP_MAX]; +static bool __read_mostly toolchain_nops_are_ideal; + +#else +# define toolchain_nops_are_ideal false +#endif + static void __init arch_init_ideal_nops(void) { switch ( boot_cpu_data.x86_vendor ) @@ -112,6 +125,11 @@ static void __init arch_init_ideal_nops(void) ideal_nops = k8_nops; break; } + +#ifdef HAVE_AS_NOP_DIRECTIVE +if ( memcmp(ideal_nops[ASM_NOP_MAX], toolchain_nops, ASM_NOP_MAX) == 0 ) +toolchain_nops_are_ideal = true; +#endif } /* Use this to add nops to a buffer, then text_poke the whole buffer. */ @@ -209,7 +227,7 @@ void init_or_livepatch apply_alternatives(struct alt_instr *start, base->priv = 1; /* Nothing useful to do? */ -if ( a->pad_len <= 1 ) +if ( toolchain_nops_are_ideal || a->pad_len <= 1 ) continue; add_nops(buf, a->pad_len); diff --git a/xen/include/asm-x86/alternative-asm.h b/xen/include/asm-x86/alternative-asm.h index 0b61516..0d6fb4b 100644 --- a/xen/include/asm-x86/alternative-asm.h +++ b/xen/include/asm-x86/alternative-asm.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_ALTERNATIVE_ASM_H_ #define _ASM_X86_ALTERNATIVE_ASM_H_ +#include + #ifdef __ASSEMBLY__ /* @@ -19,6 +21,14 @@ .byte 0 /* priv */ .endm +.macro mknops nr_bytes +#ifdef HAVE_AS_NOP_DIRECTIVE +.nops \nr_bytes, ASM_NOP_MAX +#else +.skip \nr_bytes, 0x90 +#endif +.endm + /* GAS's idea of true is -1, while Clang's idea is 1. */ #ifdef HAVE_AS_NEGATIVE_TRUE # define as_true(x) (-(x)) @@ -29,7 +39,7 @@ #define decl_orig(insn, padding) \ .L\@_orig_s: insn; .L\@_orig_e: \ .L\@_diff = padding; \ - .skip as_true(.L\@_diff > 0) * .L\@_diff, 0x90; \ + mknops (as_true(.L\@_diff > 0) * .L\@_diff); \ .L\@_orig_p: #define orig_len (.L\@_orig_e - .L\@_orig_s) diff --git a/xen/include/asm-x86/alternative.h b/xen/include/asm-x86/alternative.h index 619472e..84b4854 100644 --- a/xen/include/asm-x86/alternative.h +++ b/xen/include/asm-x86/alternative.h @@ -2,7 +2,6 @@ #define __X86_ALTERNATIVE_H__ #include -#include #ifndef __ASSEMBLY__ #include @@ -27,6 +26,14 @@ extern void add_nops(void *insns, unsigned int len); extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); extern void alternative_instructions(void); +asm ( ".macro mknops nr_bytes\n\t" +#ifdef HAVE_AS_NOP_DIRECTIVE + ".nops \\nr_bytes, " __stringify(ASM_NOP_MAX) "\n\t" +#else + ".skip \\nr_bytes, 0x90\n\t" +#endif + ".endm\n\t" ); + #define alt_orig_len "(.LXEN%=_orig_e - .LXEN%=_orig_s)" #define alt_pad_len"(.LXEN%=_orig_p - .LXEN%=_orig_e)" #define alt_total_len "(.LXEN%=_orig_p - .LXEN%=_orig_s)" @@ -46,7 +53,7 @@ extern void alternative_instructions(void); #define OLDINSTR(oldinstr, padding) \ ".LXEN%=_orig_s:\n\t" oldinstr "\n