Re: [PATCH] lib: Make _find_next_bit helper function inline

2015-08-29 Thread Yury


On 24.08.2015 01:53, Alexey Klimov wrote:

Hi Cassidy,


On Wed, Jul 29, 2015 at 11:40 PM, Cassidy Burden  wrote:

I changed the test module to now set the entire array to all 0/1s and
only flip a few bits. There appears to be a performance benefit, but
it's only 2-3% better (if that). If the main benefit of the original
patch was to save space then inlining definitely doesn't seem worth the
small gains in real use cases.

find_next_zero_bit (us)
old  new inline
14440 17080 17086
 4779  5181  5069
10844 12720 12746
 9642 11312 11253
 3858  3818  3668
10540 12349 12307
12470 14716 14697
 5403  6002  5942
 2282  1820  1418
13632 16056 15998
11048 13019 13030
 6025  6790  6706
13255 15586 15605
 3038  2744  2539
10353 12219 12239
10498 12251 12322
14767 17452 17454
12785 15048 15052
 1655  1034   691
 9924 11611 11558

find_next_bit (us)
old  new inline
 8535  9936  9667
14666 17372 16880
 2315  1799  1355
 6578  9092  8806
 6548  7558  7274
 9448 11213 10821
 3467  3497  3449
 2719  3079  2911
 6115  7989  7796
13582 16113 15643
 4643  4946  4766
 3406  3728  3536
 7118  9045  8805
 3174  3011  2701
13300 16780 16252
14285 16848 16330
11583 13669 13207
13063 15455 14989
12661 14955 14500
12068 14166 13790

On 7/29/2015 6:30 AM, Alexey Klimov wrote:


I will re-check on another machine. It's really interesting if
__always_inline makes things better for aarch64 and worse for x86_64. It
will be nice if someone will check it on x86_64 too.



Very odd, this may be related to the other compiler optimizations Yuri
mentioned?


It's better to ask Yury, i hope he can answer some day.

Do you need to re-check this (with more iterations or on another machine(s))?



Hi, Alexey, Cassidy,

(restoring Rasmus, George)

I found no difference between original and inline versions for x86_64:
(Intel(R) Core(TM) i7-2630QM CPU @ 2.00GHz)

find_next_bit   find_next_zero_bit
old new inline  old new inline
24  27  28  22  28  28
24  27  28  23  27  28
24  27  28  23  27  28

Inspecting assembler code, I found that GCC wants to see helper separated,
even if you provide '__always_inline':

inline :   current :  

280:cmp%rdx,%rsi210:cmp%rdx,%rsi
283:jbe295  213:  jbe227 

285:test   %rsi,%rsi215:test   %rsi,%rsi
288:je 295  218:  je 227 

28a:push   %rbp 21a:push   %rbp
28b:mov%rsp,%rbp21b:xor%ecx,%ecx
28e:callq  0  21d:  mov%rsp,%rbp
293:pop%rbp 220:callq  0 
<_find_next_bit.part.0>
294:retq225:pop%rbp
295:mov%rsi,%rax226:retq
298:retq227:mov%rsi,%rax
299:nopl   0x0(%rax)22a:retq
22b:nopl   0x0(%rax,%rax,1)

So things are looking like x86_64 gcc (at least 4.9.2 build for Ubuntu)
ignores '__always_inline' hint as well as 'inline'. But in case of
__always_inline compiler does something not really smart: it introduces
 and  helpers
and so increases text size from 0x250 to 0x2b9 bytes, but doesn't really
inline to optimize push/pop and call/ret. I don't like inline, as I
already told, but I believe that complete disabling is bad idea.
Maybe someone knows another trick to make inline work?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] arm64: don't load 32-bit binaries if platform has no aarch32_el0

2015-09-01 Thread Yury
On Tue, Sep 01, 2015 at 05:04:26PM +0100, Mark Rutland wrote:
> On Tue, Sep 01, 2015 at 03:41:12PM +0100, Yury Norov wrote:
> > Kernel option COMPAT defines the ability of executing aarch32 binaries.
> > Some platforms does not support aarch32 mode, and so cannot execute that
> > binaries. But we cannot just disable COMPAT for them because the same
> > kernel binary may be used by multiple platforms.
> > 
> > In this patch, system_supports_aarch32_el0() is introduced to detect
> > aarch32 support at run-time.
> > 
> > Signed-off-by: Yury Norov 
> > ---
> >  arch/arm64/include/asm/cpufeature.h | 1 +
> >  arch/arm64/include/asm/elf.h| 6 --
> >  arch/arm64/kernel/cpuinfo.c | 9 +
> >  3 files changed, 14 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/arm64/include/asm/cpufeature.h 
> > b/arch/arm64/include/asm/cpufeature.h
> > index 20cdc26..d24ea15 100644
> > --- a/arch/arm64/include/asm/cpufeature.h
> > +++ b/arch/arm64/include/asm/cpufeature.h
> > @@ -81,6 +81,7 @@ void check_local_cpu_errata(void);
> >  void check_local_cpu_features(void);
> >  bool cpu_supports_mixed_endian_el0(void);
> >  bool system_supports_mixed_endian_el0(void);
> > +bool system_supports_aarch32_el0(void);
> >  
> >  #endif /* __ASSEMBLY__ */
> >  
> > diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h
> > index faad6df..461897b 100644
> > --- a/arch/arm64/include/asm/elf.h
> > +++ b/arch/arm64/include/asm/elf.h
> > @@ -21,6 +21,7 @@
> >  /*
> >   * ELF register definitions..
> >   */
> > +#include 
> >  #include 
> >  #include 
> >  
> > @@ -173,8 +174,9 @@ typedef compat_elf_greg_t   
> > compat_elf_gregset_t[COMPAT_ELF_NGREG];
> >  
> >  /* AArch32 EABI. */
> >  #define EF_ARM_EABI_MASK   0xff00
> > -#define compat_elf_check_arch(x)   (((x)->e_machine == EM_ARM) && \
> > -((x)->e_flags & EF_ARM_EABI_MASK))
> > +#define compat_elf_check_arch(x)   (system_supports_aarch32_el0()  \
> > +   && ((x)->e_machine == EM_ARM)   \
> > +   && ((x)->e_flags & EF_ARM_EABI_MASK))
> >  
> >  #define compat_start_threadcompat_start_thread
> >  #define COMPAT_SET_PERSONALITY(ex) set_thread_flag(TIF_32BIT);
> > diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
> > index 75d5a86..95d953f 100644
> > --- a/arch/arm64/kernel/cpuinfo.c
> > +++ b/arch/arm64/kernel/cpuinfo.c
> > @@ -79,6 +79,15 @@ bool system_supports_mixed_endian_el0(void)
> > return mixed_endian_el0;
> >  }
> >  
> > +#define AARCH64    1
> > +#define AARCH32_64 2
> 
> These should be better namespaced. Perhaps ID_AA64PFR0_EL1_EL0_64 and
> ID_AA64PFR0_EL1_EL0_6432 ?
> 
> > +bool system_supports_aarch32_el0(void)
> > +{
> > +   struct cpuinfo_arm64 *info = this_cpu_ptr(&cpu_data);
> > +   u64 arm64_el0 = info->reg_id_aa64pfr0 & 0xf;
> > +   return arm64_el0 == AARCH32_64;
> > +}
> 
> We should handle this the same way as we do for endianness support and
> check that all CPUs support AArch32, and set a global flag, rather than
> assuming that all CPUs are symmetric. Likewise for any other feature we
> have to dynamically detect.
> 

OK. Will do in V2

> Thanks,
> Mark.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] lib: find_*_bit reimplementation

2015-01-25 Thread Yury
On 24.01.2015 03:45, Rasmus Villemoes wrote:
> On Mon, Jan 19 2015, Yury Norov  wrote:
>
>> New implementation takes less space, and, I hope, easier to understand.
>>
>> Signed-off-by: Yury Norov 
>> ---
>>  lib/find_next_bit.c | 265 
>> +++-
>>  1 file changed, 73 insertions(+), 192 deletions(-)
>>
> That diffstat is certainly nice. Do you also have numbers for the
> size of the generated code, and do you know if there is a
> measurable performance difference? Have you tested whether the new and
> old code give the same results, also in corner cases?
Hello, Rasmus. Thank you for your time.

> Do you also have numbers for the size of the generated code

Before text section is 817 bytes, after - 533.
Compared to this version, I have the patch modified now, and the numbers may
differ a little.

> Have you tested whether the new and old code give the same results, also in 
> corner cases?

I tested new version together with old one, and if new did return different 
value,
it was printed to system log. I didn't measure performance because I don't 
expect
significant gain here. But now I think it's good idea to write tests for 
performance and
corner cases. Maybe someone already did it... Brief googling didn't help. 
Anyway, with
new version of patch I will show my measures.
>
> Some remarks inline below.
>
>> diff --git a/lib/find_next_bit.c b/lib/find_next_bit.c
>> index 0cbfc0b..a5c915f 100644
>> --- a/lib/find_next_bit.c
>> +++ b/lib/find_next_bit.c
>> @@ -11,10 +11,39 @@
>>  
>>  #include 
>>  #include 
>> +#include 
>>  #include 
>>  #include 
>>  
>> -#define BITOP_WORD(nr)  ((nr) / BITS_PER_LONG)
>> +#define HIGH_BITS_MASK(nr)  (ULONG_MAX << (nr))
>> +#define MIN(a, b)   ((a) < (b) ? (a) : (b))
>> +
> Please don't duplicate min/max macros. kernel.h already provides everything 
> you need.
Ok.
>
>> +#if !defined(find_next_bit) || !defined(find_next_zero_bit)
>> +static unsigned long _find_next_bit(const unsigned long *addr,
>> +unsigned long end, unsigned long start, unsigned long flags)
> Having two parameters called end and start appearing in that
> order is slightly confusing. Why not keep the name 'size' for
> end, or maybe 'nbits' to make the unit clear. Also, I think flags
> should just be a bool and maybe renamed to something more meaningful.
You're right, Something like this:
 static unsigned long _find_next_bit(const unsigned long *addr,
unsigned long nbits, unsigned long start_bit, bool set)
looks better.
>
>> +{
>> +unsigned long tmp = flags ? addr[start / BITS_PER_LONG]
>> +: ~addr[start / BITS_PER_LONG];
>> +
>> +/* Handle 1st word. */
>> +if (!IS_ALIGNED(start, BITS_PER_LONG)) {
>> +tmp &= HIGH_BITS_MASK(start % BITS_PER_LONG);
>> +start = round_down(start, BITS_PER_LONG);
>> +}
>> +
>> +do {
>> +if (tmp)
>> +return MIN(start + __ffs(tmp), end);
>> +
>> +start += BITS_PER_LONG;
>> +if (start >= end)
>> +return end;
>> +
>> +tmp = flags ? addr[start / BITS_PER_LONG]
>> +: ~addr[start / BITS_PER_LONG];
>> +} while (1);
>> +}
>> +#endif
>>  
>>  #ifndef find_next_bit
>>  /*
>> @@ -23,86 +52,16 @@
>>  unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
>>  unsigned long offset)
>>  {
>> -const unsigned long *p = addr + BITOP_WORD(offset);
>> -unsigned long result = offset & ~(BITS_PER_LONG-1);
>> -unsigned long tmp;
>> -
>> -if (offset >= size)
>> -return size;
> The previous versions handled this, but your code will always access
> the word at addr[start/BITS_PER_LONG]. Are you sure no caller ever
> passes start >= size?
You're right. Fixed.
>
>> -size -= result;
>> -offset %= BITS_PER_LONG;
>> -if (offset) {
>> -tmp = *(p++);
>> -tmp &= (~0UL << offset);
>> -if (size < BITS_PER_LONG)
>> -goto found_first;
>> -if (tmp)
>> -goto found_middle;
>> -size -= BITS_PER_LONG;
>> -result += BITS_PER_LONG;
>> -}
>> -while (size & ~(BITS_PER_LONG-1)) {
>> -if ((tmp = *(p++)))
>&g

Re: [PATCH v2] lib: bitmap_[empty,full]: remove code duplication

2015-03-31 Thread Yury

On 01.04.2015 02:06, Andrew Morton wrote:

On Sun, 29 Mar 2015 05:03:55 +0300 Yury Norov  wrote:


Function 'bitmap_empty' has it's own implementation.
But it's clearly as simple as:
"find_first_bit(src, nbits) == nbits"
The same is true for 'bitmap_full'.

Looks OK.

Please send a Signed-off-by: for this patch.

Hello Andrew,

Is it enough?

Signed-off-by: Yury Norov 

BR,
Yury Norov
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2] selinux: reduce locking overhead in inode_free_security()

2015-06-13 Thread Yury



On 13.06.2015 01:35, Waiman Long wrote:

On 06/12/2015 08:31 AM, Stephen Smalley wrote:

On 06/12/2015 02:26 AM, Raghavendra K T wrote:

On 06/12/2015 03:01 AM, Waiman Long wrote:
The inode_free_security() function just took the superblock's 
isec_lock
before checking and trying to remove the inode security struct from 
the

linked list. In many cases, the list was empty and so the lock taking
is wasteful as no useful work is done. On multi-socket systems with
a large number of CPUs, there can also be a fair amount of spinlock
contention on the isec_lock if many tasks are exiting at the same 
time.


This patch changes the code to check the state of the list first
before taking the lock and attempting to dequeue it. As this function
is called indirectly from __destroy_inode(), there can't be another
instance of inode_free_security() running on the same inode.

Signed-off-by: Waiman Long
---
   security/selinux/hooks.c |   15 ---
   1 files changed, 12 insertions(+), 3 deletions(-)

v1->v2:
   - Take out the second list_empty() test inside the lock.

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 7dade28..e5cdad7 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -254,10 +254,19 @@ static void inode_free_security(struct inode
*inode)
   struct inode_security_struct *isec = inode->i_security;
   struct superblock_security_struct *sbsec = 
inode->i_sb->s_security;


-spin_lock(&sbsec->isec_lock);
-if (!list_empty(&isec->list))
+/*
+ * As not all inode security structures are in a list, we 
check for
+ * empty list outside of the lock to make sure that we won't 
waste

+ * time taking a lock doing nothing. As inode_free_security() is
+ * being called indirectly from __destroy_inode(), there is no 
way

+ * there can be two or more concurrent calls. So doing the
list_empty()
+ * test outside the loop should be safe.
+ */
+if (!list_empty(&isec->list)) {
+spin_lock(&sbsec->isec_lock);
   list_del_init(&isec->list);

Stupid question,

I need to take a look at list_del_init() code, but it can so happen 
that

if !list_empty() check could happen simultaneously, then serially two
list_del_init() can happen.

is that not a problem()?

Hmm...I suppose that's possible (sb_finish_set_opts and
inode_free_security could both perform the list_del_init).  Ok, we'll
stay with the first version.



Actually, list_del_init() can be applied twice with no harm being 
done. The first list_del_init() will set list-> next = list->prev = 
list. The second one will do the same thing and so it should be safe.


Cheers,
Longman



Hello, Waiman!

At first, minor.
For me, moving the line 'if (!list_empty(&isec->list))' out of lock is 
not possible just because 'inode_free_security' is called from 
'__destroy_inode' only. You cannot rely on it in future. It's rather 
possible because empty list is invariant under 'list_del_init', as you 
noted here. In fact, you can call 'list_del_init' unconditionally here, 
and condition is the only optimization to decrease lock contention. So, 
I'd like to ask you reflect it in your comment.


At second, less minor.
Now that you access list element outside of the lock, why don't you use 
'list_empty_careful' instead of 'list_empty'? It may eliminate possible 
race between, say, 'list_add' and 'list_empty', and costs you virtually 
nothing.


Best regards,
Yury



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3] selinux: reduce locking overhead in inode_free_security()

2015-06-15 Thread Yury

On 15.06.2015 20:13, Waiman Long wrote:

The inode_free_security() function just took the superblock's isec_lock
before checking and trying to remove the inode security struct from the
linked list. In many cases, the list was empty and so the lock taking
is wasteful as no useful work is done. On multi-socket systems with
a large number of CPUs, there can also be a fair amount of spinlock
contention on the isec_lock if many tasks are exiting at the same time.

This patch changes the code to check the state of the list first before
taking the lock and attempting to dequeue it. The list_del_init()
can be called more than once on the same list with no harm as long
as they are properly serialized. It should not be possible to have
inode_free_security() called concurrently with list_add(). For better
safety, however, we use list_empty_careful() here even though it is
still not completely safe in case that happens.

Signed-off-by: Waiman Long 
---
  security/selinux/hooks.c |   17 ++---
  1 files changed, 14 insertions(+), 3 deletions(-)

v1->v2:
  - Take out the second list_empty() test inside the lock.

v2->v3:
  - Fix incorrent comment and commit log message.

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 7dade28..2a99804 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -254,10 +254,21 @@ static void inode_free_security(struct inode *inode)
struct inode_security_struct *isec = inode->i_security;
struct superblock_security_struct *sbsec = inode->i_sb->s_security;
  
-	spin_lock(&sbsec->isec_lock);

-   if (!list_empty(&isec->list))
+   /*
+* As not all inode security structures are in a list, we check for
+* empty list outside of the lock to make sure that we won't waste
+* time taking a lock doing nothing.
+*
+* The list_del_init() function can be safely called more than once.
+* It should not be possible for this function to be called with
+* concurrent list_add(), but for better safety against future changes
+* in the code, we use list_empty_careful() here.
+*/
+   if (!list_empty_careful(&isec->list)) {
+   spin_lock(&sbsec->isec_lock);
list_del_init(&isec->list);
-   spin_unlock(&sbsec->isec_lock);
+   spin_unlock(&sbsec->isec_lock);
+   }
  
  	/*

 * The inode may still be referenced in a path walk and

Hi Waiman,

If you need my Acked-by, you have it.

BR,
Yury
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] lib: Make _find_next_bit helper function inline

2015-07-28 Thread Yury

On 28.07.2015 22:09, Cassidy Burden wrote:

I've tested Yury Norov's find_bit reimplementation with the test_find_bit
module (https://lkml.org/lkml/2015/3/8/141) and measured about 35-40%
performance degradation on arm64 3.18 run with fixed CPU frequency.

The performance degradation appears to be caused by the
helper function _find_next_bit. After inlining this function into
find_next_bit and find_next_zero_bit I get slightly better performance
than the old implementation:

find_next_zero_bit  find_next_bit
old  new inline old  new inline
26   36  24 24   33  23
25   36  24 24   33  23
26   36  24 24   33  23
25   36  24 24   33  23
25   36  24 24   33  23
25   37  24 24   33  23
25   37  24 24   33  23
25   37  24 24   33  23
25   36  24 24   33  23
25   37  24 24   33  23

Signed-off-by: Cassidy Burden 
Cc: Alexey Klimov 
Cc: David S. Miller 
Cc: Daniel Borkmann 
Cc: Hannes Frederic Sowa 
Cc: Lai Jiangshan 
Cc: Mark Salter 
Cc: AKASHI Takahiro 
Cc: Thomas Graf 
Cc: Valentin Rothberg 
Cc: Chris Wilson 
---
  lib/find_bit.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/find_bit.c b/lib/find_bit.c
index 18072ea..d0e04f9 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -28,7 +28,7 @@
   * find_next_zero_bit.  The difference is the "invert" argument, which
   * is XORed with each fetched word before searching it for one bits.
   */
-static unsigned long _find_next_bit(const unsigned long *addr,
+static inline unsigned long _find_next_bit(const unsigned long *addr,
unsigned long nbits, unsigned long start, unsigned long invert)
  {
unsigned long tmp;


Hi Cassidi,

At first, I'm really surprised that there's no assembler implementation
of find_bit routines for aarch64. Aarch32 has ones...

I was thinking on inlining the helper, but decided not to do this

1. Test is not too realistic. https://lkml.org/lkml/2015/2/1/224
The typical usage pattern is to look for a single bit or range of bits.
So in practice nobody calls find_next_bit thousand times.

2. Way more important to fit functions into as less cache lines as
possible. https://lkml.org/lkml/2015/2/12/114
In this case, inlining increases cache lines consumption almost twice...

3. Inlining prevents compiler from some other possible optimizations. It's
probable that in real module compiler will inline callers of _find_next_bit,
and final output will be better. I don't like to point out the compiler how
it should do its work.

Nevertheless, if this is your real case, and inlining helps, I'm OK with it.

But I think, before/after for x86 is needed as well.
And why don't you consider '__always_inline__'? Simple inline is only a 
hint and

guarantees nothing.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] lib: Make _find_next_bit helper function inline

2015-07-28 Thread Yury

On 29.07.2015 00:23, Yury wrote:

On 28.07.2015 22:09, Cassidy Burden wrote:
I've tested Yury Norov's find_bit reimplementation with the 
test_find_bit

module (https://lkml.org/lkml/2015/3/8/141) and measured about 35-40%
performance degradation on arm64 3.18 run with fixed CPU frequency.

The performance degradation appears to be caused by the
helper function _find_next_bit. After inlining this function into
find_next_bit and find_next_zero_bit I get slightly better performance
than the old implementation:

find_next_zero_bit  find_next_bit
old  new inline old  new inline
26   36  24 24   33  23
25   36  24 24   33  23
26   36  24 24   33  23
25   36  24 24   33  23
25   36  24 24   33  23
25   37  24 24   33  23
25   37  24 24   33  23
25   37  24 24   33  23
25   36  24 24   33  23
25   37  24 24   33  23

Signed-off-by: Cassidy Burden 
Cc: Alexey Klimov 
Cc: David S. Miller 
Cc: Daniel Borkmann 
Cc: Hannes Frederic Sowa 
Cc: Lai Jiangshan 
Cc: Mark Salter 
Cc: AKASHI Takahiro 
Cc: Thomas Graf 
Cc: Valentin Rothberg 
Cc: Chris Wilson 
---
  lib/find_bit.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/find_bit.c b/lib/find_bit.c
index 18072ea..d0e04f9 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -28,7 +28,7 @@
   * find_next_zero_bit.  The difference is the "invert" argument, which
   * is XORed with each fetched word before searching it for one bits.
   */
-static unsigned long _find_next_bit(const unsigned long *addr,
+static inline unsigned long _find_next_bit(const unsigned long *addr,
  unsigned long nbits, unsigned long start, unsigned long 
invert)

  {
  unsigned long tmp;


Hi Cassidi,

At first, I'm really surprised that there's no assembler implementation
of find_bit routines for aarch64. Aarch32 has ones...

I was thinking on inlining the helper, but decided not to do this

1. Test is not too realistic. https://lkml.org/lkml/2015/2/1/224
The typical usage pattern is to look for a single bit or range of bits.
So in practice nobody calls find_next_bit thousand times.

2. Way more important to fit functions into as less cache lines as
possible. https://lkml.org/lkml/2015/2/12/114
In this case, inlining increases cache lines consumption almost twice...

3. Inlining prevents compiler from some other possible optimizations. 
It's
probable that in real module compiler will inline callers of 
_find_next_bit,
and final output will be better. I don't like to point out the 
compiler how

it should do its work.

Nevertheless, if this is your real case, and inlining helps, I'm OK 
with it.


But I think, before/after for x86 is needed as well.
And why don't you consider '__always_inline__'? Simple inline is only 
a hint and

guarantees nothing.


(Sorry for typo in your name. Call me Yuri next time.)

Adding Rasmus and George to CC

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v4 2/7] mm: kasan: introduce generic kasan_populate_zero_shadow()

2015-07-27 Thread Yury

> Introduce generic kasan_populate_zero_shadow(start, end).
> This function maps kasan_zero_page to the [start, end] addresses.
>
> In follow on patches it will be used for ARMv8 (and maybe other
> architectures) and will replace x86_64 specific populate_zero_shadow().
>
> Signed-off-by: Andrey Ryabinin 
> ---
>  arch/x86/mm/kasan_init_64.c |  14 
>  include/linux/kasan.h   |   8 +++
>  mm/kasan/Makefile   |   2 +-
>  mm/kasan/kasan_init.c   | 151 


>  4 files changed, 160 insertions(+), 15 deletions(-)
>  create mode 100644 mm/kasan/kasan_init.c
>
> diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
> index e1840f3..812086c 100644
> --- a/arch/x86/mm/kasan_init_64.c
> +++ b/arch/x86/mm/kasan_init_64.c
> @@ -12,20 +12,6 @@
>  extern pgd_t early_level4_pgt[PTRS_PER_PGD];
>  extern struct range pfn_mapped[E820_X_MAX];
>
> -static pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss;
> -static pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss;
> -static pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss;
> -
> -/*
> - * This page used as early shadow. We don't use empty_zero_page
> - * at early stages, stack instrumentation could write some garbage
> - * to this page.
> - * Latter we reuse it as zero shadow for large ranges of memory
> - * that allowed to access, but not instrumented by kasan
> - * (vmalloc/vmemmap ...).
> - */
> -static unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss;
> -
>  static int __init map_range(struct range *range)
>  {
>  unsigned long start;
> diff --git a/include/linux/kasan.h b/include/linux/kasan.h
> index 6fb1c7d..d795f53 100644
> --- a/include/linux/kasan.h
> +++ b/include/linux/kasan.h
> @@ -12,8 +12,16 @@ struct vm_struct;
>  #define KASAN_SHADOW_SCALE_SHIFT 3
>
>  #include 
> +#include 
>  #include 
>
> +extern unsigned char kasan_zero_page[PAGE_SIZE];
> +extern pte_t kasan_zero_pte[PTRS_PER_PTE];
> +extern pmd_t kasan_zero_pmd[PTRS_PER_PMD];
> +extern pud_t kasan_zero_pud[PTRS_PER_PUD];
> +
> +void kasan_populate_zero_shadow(const void *from, const void *to);
> +
>  static inline void *kasan_mem_to_shadow(const void *addr)
>  {
>  return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT)
> diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
> index bd837b8..6471014 100644
> --- a/mm/kasan/Makefile
> +++ b/mm/kasan/Makefile
> @@ -5,4 +5,4 @@ CFLAGS_REMOVE_kasan.o = -pg
>  # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
>  CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack 
-fno-stack-protector)

>
> -obj-y := kasan.o report.o
> +obj-y := kasan.o report.o kasan_init.o
> diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c
> new file mode 100644
> index 000..e276853
> --- /dev/null
> +++ b/mm/kasan/kasan_init.c
> @@ -0,0 +1,151 @@
> +/*
> + * This file contains some kasan initialization code.
> + *
> + * Copyright (c) 2015 Samsung Electronics Co., Ltd.
> + * Author: Andrey Ryabinin
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include 
> +#include 
> +
> +/*
> + * This page serves two purposes:
> + *   - It used as early shadow memory. The entire shadow region 
populated
> + * with this page, before we will be able to setup normal shadow 
memory.
> + *   - Latter it reused it as zero shadow to cover large ranges of 
memory
> + * that allowed to access, but not handled by kasan 
(vmalloc/vmemmap ...).

> + */
> +unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss;
> +
> +#if CONFIG_PGTABLE_LEVELS > 3
> +pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss;
> +#endif
> +#if CONFIG_PGTABLE_LEVELS > 2
> +pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss;
> +#endif

You declare kasan_zero_pud and kasan_zero_pmd conditionally now, but use
unconditionally, at least in kasan_init in patch #5. If I'm not missing
something, this is wrong...

> +pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss;
> +
> +static __init void *early_alloc(size_t size, int node)
> +{
> +return memblock_virt_alloc_try_nid(size, size, 
__pa(MAX_DMA_ADDRESS),

> +BOOTMEM_ALLOC_ACCESSIBLE, node);
> +}
> +
> +static void __init zero_pte_populate(pmd_t *pmd, unsigned long addr,
> +unsigned long end)
> +{
> +pte_t *pte = pte_offset_kernel(pmd, addr);
> +pte_t zero_pte;
> +
> +zero_pte = pfn_pte(PFN_DOWN(__pa(kasan_zero_page)), PAGE_KERNEL);
> +zero_pte = pte_wrprotect(zero_pte);
> +
> +while (addr + PAGE_SIZE <= end) {
> +set_pte_at(&init_mm, addr, pte, zero_pte);
> +addr += PAGE_SIZE;
> +pte = pte_offset_kernel(pmd, addr);
> +}
> +}
> +
> +static void __init zero_pmd_populate(pud_t *pud, unsigned long addr,

Re: [PATCH v5 1/5] arm/arm64: add smccc ARCH32

2015-08-19 Thread Yury

On 19.08.2015 11:40, Jens Wiklander wrote:
>
> Adds helpers to do SMC based on ARM SMC Calling Convention.
> CONFIG_HAVE_SMCCC is enabled for architectures that may support
> the SMC instruction. It's the responsibility of the caller to
> know if the SMC instruction is supported by the platform.
>
> Signed-off-by: Jens Wiklander 
> ---
>  arch/arm/Kconfig   |  4 +++
>  arch/arm/kernel/Makefile   |  2 ++
>  arch/arm/kernel/smccc-call.S   | 26 ++
>  arch/arm/kernel/smccc.c| 17 +
>  arch/arm64/Kconfig |  4 +++
>  arch/arm64/kernel/Makefile |  1 +
>  arch/arm64/kernel/smccc-call.S | 34 ++
>  arch/arm64/kernel/smccc.c  | 17 +
>  include/linux/arm-smccc.h  | 79 
++

>  9 files changed, 184 insertions(+)
>  create mode 100644 arch/arm/kernel/smccc-call.S
>  create mode 100644 arch/arm/kernel/smccc.c
>  create mode 100644 arch/arm64/kernel/smccc-call.S
>  create mode 100644 arch/arm64/kernel/smccc.c
>  create mode 100644 include/linux/arm-smccc.h
>
> diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
> index 45df48b..75e4da3 100644
> --- a/arch/arm/Kconfig
> +++ b/arch/arm/Kconfig
> @@ -221,6 +221,9 @@ config NEED_RET_TO_USER
>  config ARCH_MTD_XIP
>bool
>
> +config HAVE_SMCCC
> +  bool
> +
>  config VECTORS_BASE
>hex
>default 0x if MMU || CPU_HIGH_VECTOR
> @@ -324,6 +327,7 @@ config ARCH_MULTIPLATFORM
>select CLKSRC_OF
>select COMMON_CLK
>select GENERIC_CLOCKEVENTS
> +  select HAVE_SMCCC if CPU_V7
>select MIGHT_HAVE_PCI
>select MULTI_IRQ_HANDLER
>select SPARSE_IRQ
> diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
> index 752725d..8cdd25b 100644
> --- a/arch/arm/kernel/Makefile
> +++ b/arch/arm/kernel/Makefile
> @@ -90,4 +90,6 @@ obj-y+= psci.o psci-call.o
>  obj-$(CONFIG_SMP) += psci_smp.o
>  endif
>
> +obj-$(CONFIG_HAVE_SMCCC)  += smccc-call.o smccc.o
> +
>  extra-y := $(head-y) vmlinux.lds
> diff --git a/arch/arm/kernel/smccc-call.S b/arch/arm/kernel/smccc-call.S
> new file mode 100644
> index 000..05bc554
> --- /dev/null
> +++ b/arch/arm/kernel/smccc-call.S
> @@ -0,0 +1,26 @@
> +/*
> + * Copyright (c) 2015, Linaro Limited
> + *
> + * This software is licensed under the terms of the GNU General Public
> + * License version 2, as published by the Free Software Foundation, and
> + * may be copied, distributed, and modified under those terms.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + */
> +#include 
> +
> +#include 
> +
> +/* void smccc_call32(struct smccc_param32 *param) */
> +ENTRY(smccc_call32)
> +  push{r4-r8, lr}
> +  mov r8, r0
> +  ldm r8, {r0-r7}
> +  __SMC(0)
> +  stm r8, {r0-r7}
> +  pop {r4-r8, pc}
> +ENDPROC(smccc_call32)
> diff --git a/arch/arm/kernel/smccc.c b/arch/arm/kernel/smccc.c
> new file mode 100644
> index 000..ba4039e
> --- /dev/null
> +++ b/arch/arm/kernel/smccc.c
> @@ -0,0 +1,17 @@
> +/*
> + * Copyright (c) 2015, Linaro Limited
> + *
> + * This software is licensed under the terms of the GNU General Public
> + * License version 2, as published by the Free Software Foundation, and
> + * may be copied, distributed, and modified under those terms.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + */
> +#include 
> +#include 
> +
> +EXPORT_SYMBOL_GPL(smccc_call32);
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 7796af4..b3ea778 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -83,6 +83,7 @@ config ARM64
>select SPARSE_IRQ
>select SYSCTL_EXCEPTION_TRACE
>select HAVE_CONTEXT_TRACKING
> +  select HAVE_SMCCC
>help
>  ARM 64-bit (AArch64) Linux support.
>
> @@ -146,6 +147,9 @@ config KERNEL_MODE_NEON
>  config FIX_EARLYCON_MEM
>def_bool y
>
> +config HAVE_SMCCC
> +  bool
> +
>  config PGTABLE_LEVELS
>int
>default 2 if ARM64_64K_PAGES && ARM64_VA_BITS_42
> diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
> index 426d076..f7804f7 100644
> --- a/arch/arm64/kernel/Makefile
> +++ b/arch/arm64/kernel/Makefile
> @@ -36,6 +36,7 @@ arm64-obj-$(CONFIG_EFI)			+= efi.o efi-stub.o 
efi-entry.o

>  arm64-obj-$(CONFIG_PCI)   += pci.o
>  arm64-obj-$(CONFIG_ARMV8_DEPRECATED)  += armv8_deprecated.o
>  arm64-obj-$(CONFIG_ACPI)  += acpi.o
> +arm64-obj-$(CONFIG_HAVE_SMCCC)+= smccc-call.o smccc.o
>
>  obj-y += $(arm64-obj-y) vdso/
>  obj-m 

Re: [PATCH v2 1/3] lib: find_*_bit reimplementation

2015-02-04 Thread Yury

On 02.02.2015 13:43, Rasmus Villemoes wrote:
> On Sat, Jan 31 2015, yury.no...@gmail.com wrote:
>
>> From: Yury Norov 
>>
>> New implementations takes less space in source file (see diffstat)
>> and in object. For me it's 710 vs 453 bytes of text.
>>
> New version generally looks good. Please include a summary of the
> changes between the versions either below the --- line or in a 0/n cover
> letter, especially since you've now expanded the scope of the series.
>
> Comments below.
>
>> Patch was boot-tested on x86_64 and MIPS (big-endian) machines.
>> Performance tests were run in userspace with code like this:
>>
>>  /* addr[] is filled from /dev/urandom */
>>  start = clock();
>>  while (ret < nbits)
>>  ret = find_next_bit(addr, nbits, ret + 1);
>>
>>  end = clock();
>>  printf("%ld\t", (unsigned long) end - start);
>>
>> On Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz the results are as follows:
>> (for find_next_bit, nbits is 8M, for find_first_bit - 80K)
>>
>>  find_next_bit:  find_first_bit:
>>  new current new current
>>  26932   43151   14777   14925
>>  26947   43182   14521   15423
>>  26507   43824   15053   14705
>>  27329   43759   14473   14777
>>  26895   43367   14847   15023
>>  26990   43693   15103   15163
>>  26775   43299   15067   15232
>>  27282   42752   14544   15121
>>  27504   43088   14644   14858
>>  26761   43856   14699   15193
>>  26692   43075   14781   14681
>>  27137   42969   14451   15061
>>  ... ...
>>
>> find_next_bit performance gain is 35-40%;
>> find_first_bit - no measurable difference.
>>
>> Signed-off-by: Yury Norov 
>> ---
>>  lib/find_last_bit.c |  31 ++-
>>  lib/find_next_bit.c | 254 
>> +++-
>>  2 files changed, 79 insertions(+), 206 deletions(-)
>>
>> diff --git a/lib/find_last_bit.c b/lib/find_last_bit.c
>> index 91ca09f..e67e970 100644
>> --- a/lib/find_last_bit.c
>> +++ b/lib/find_last_bit.c
>> @@ -4,44 +4,29 @@
>>   * Written by Rusty Russell 
>>   * (Inspired by David Howell's find_next_bit implementation)
>>   *
>> + * Rewritten by Yury Norov  to decrease
>> + * size and improve performance, 2015.
>> + *
>>   * This program is free software; you can redistribute it and/or
>>   * modify it under the terms of the GNU General Public License
>>   * as published by the Free Software Foundation; either version
>>   * 2 of the License, or (at your option) any later version.
>>   */
>>  
>> -#include 
> Why do you remove that #include? It is rather important that the header
> and implementation don't get out of sync. I know that kernel.h includes
> bitops.h, but please don't rely on such things. Quoting SubmitChecklist:
>
> 1: If you use a facility then #include the file that defines/declares
>that facility.  Don't depend on other header files pulling in ones
>that you use.
>
>
>>  #include 
>> -#include 
>> -#include 
> However, getting rid of includes that are no longer needed is certainly
> a good thing.
Yes, linux/bitops.h are to get back.
>> +#include 
>>  
>>  #ifndef find_last_bit
>>  
>>  unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
>>  {
>> -unsigned long words;
>> -unsigned long tmp;
>> -
>> -/* Start at final word. */
>> -words = size / BITS_PER_LONG;
>> -
>> -/* Partial final word? */
>> -if (size & (BITS_PER_LONG-1)) {
>> -tmp = (addr[words] & (~0UL >> (BITS_PER_LONG
>> - - (size & (BITS_PER_LONG-1);
>> -if (tmp)
>> -goto found;
>> -}
>> +unsigned long idx = DIV_ROUND_UP(size, BITS_PER_LONG);
>>  
>> -while (words) {
>> -tmp = addr[--words];
>> -if (tmp) {
>> -found:
>> -return words * BITS_PER_LONG + __fls(tmp);
>> -}
>> +while (idx--) {
>> +if (addr[idx])
>> +return min(idx * BITS_PER_LONG + __fls(addr[idx]), 
>> size);
>>  }
>>  
>> -/* Not found */
>>  return size;
>>  }
>>  EXPORT_SYMBOL(f

Re: [PATCH v2 1/3] lib: find_*_bit reimplementation

2015-02-04 Thread Yury

On 02.02.2015 06:17, George Spelvin wrote:
> Yury Norov  wrote:
>> New implementations takes less space in source file (see diffstat)
>> and in object. For me it's 710 vs 453 bytes of text.
>>
>> Patch was boot-tested on x86_64 and MIPS (big-endian) machines.
>> Performance tests were run in userspace with code like this:
>>
>>  /* addr[] is filled from /dev/urandom */
>>  start = clock();
>>  while (ret < nbits)
>>  ret = find_next_bit(addr, nbits, ret + 1);
>>
>>  end = clock();
>>  printf("%ld\t", (unsigned long) end - start);
>> On Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz the results are as follows:
>> (for find_next_bit, nbits is 8M, for find_first_bit - 80K)
>>
>>  find_next_bit:  find_first_bit:
>>  new current new current
>>  26932   43151   14777   14925
>>  26947   43182   14521   15423
> I'll look at this more carefully, but one immediate thought is that this
> is an unrealistic benchmark.  It will almost never need to look at more
> than one word of the array, but real arrays have long runs of zero
> bits to skip over.
>
> So the code size is appreciated, but the time benefits may be the result
> of you optimizing for the wrong thing.
>
> I'd try filling the array with mostly-identical bits, flipping with odds
> of 1/256 or so.
>
> For full generality, I'd test different 1->0 and 0->1 transition
> probabilities.  (But powers of two are probably enough for benchmarking.)
>
I think a test with random values represents at least one situation:
well-fragmented memory after a long period of operation. (This is what I
really have in my project.) On the other hand, if long zero runs are
typical behavior for one's system, it's a good opportunity for
improvements, I think.
Anyway, the idea of testing find_bit on long runs is good. Thank you.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 1/3] lib: find_*_bit reimplementation

2015-02-04 Thread Yury

On 02.02.2015 15:56, Rasmus Villemoes wrote:
> On Mon, Feb 02 2015, "George Spelvin"  wrote:
>
>> Rasmus Villemoes  wrote:
>>> ... and this be part of _find_next_bit? Can find_next_bit not be simply
>>> 'return _find_next_bit(addr, size, offset, 1);', and similarly for
>>> find_next_zero_bit? Btw., passing true and false for the boolean
>>> parameter may be a little clearer.
>> Looking at the generated code, it would be better to replace the boolean
>> parameter with 0ul or ~0ul and XOR with it.  The same number of registers,
>> and saves a conditional branch.
> Nice trick. When I compiled it, gcc inlined _find_next_bit into both its
> callers, making the conditional go away completely. That was with gcc
> 4.7. When I try with 5.0, I do see _find_next_bit being compiled
> separately.
>
> With the proposed change, 4.7 also makes find_next{,_zero}_bit wrappers
> for _find_next_bit, further reducing the total size, which is a good
> thing. And, if some other version decides to still inline it, it
> should then know how to optimize the xor with 0ul or ~0ul just as well
> as when the conditional was folded away. 
>
> Yury, please also incorporate this in the next round.
>
> Rasmus
>
Ok.
What are you thinking about joining _find_next_bit and _find_next_bit_le?
They really differ in 2 lines.  It's generally good to remove duplications,
and it may decrease text size for big-endian machines. But it definitely
doesn't make code easier for reading, and maybe affects performance
after the optimization suggested by George...

(I didn't test it yet)

 29 #if !defined(find_next_bit) || !defined(find_next_zero_bit) \
 30 || (defined(BIG_ENDIAN) && \
 31 (!defined(find_next_bit_le) || 
!defined(find_next_zero_bit_le)))
 32 static unsigned long _find_next_bit(const unsigned long *addr,
 33 unsigned long nbits, unsigned long start, unsigned long 
flags)
 34 {
 35 unsigned long xor_mask = (flags & SET) ? 0UL : ULONG_MAX;
 36 unsigned long tmp = addr[start / BITS_PER_LONG] ^ xor_mask;
 37 
 38 /* Handle 1st word. */
 39 if (!IS_ALIGNED(start, BITS_PER_LONG)) {
 40 #ifdef BIG_ENDIAN
 41 if (flags & LE)
 42 tmp &= ext2_swab(HIGH_BITS_MASK(start % 
BITS_PER_LONG));
 43 else
 44 #endif
 45 tmp &= HIGH_BITS_MASK(start % BITS_PER_LONG);
 46 
 47 start = round_down(start, BITS_PER_LONG);
 48 }
 49 
 50 while (!tmp) {
 51 start += BITS_PER_LONG;
 52 if (start >= nbits)
 53 return nbits;
 54 
 55 tmp = addr[start / BITS_PER_LONG] ^ xor_mask;
 56 }
 57 
 58 #ifdef BIG_ENDIAN
 59 if (flags & LE)
 60 return start + __ffs(ext2_swab(tmp));
 61 
 62 #endif
 63 return start + __ffs(tmp);
 64 }
 65 #endif


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 1/3] lib: find_*_bit reimplementation

2015-02-11 Thread Yury

On 09.02.2015 14:53, Rasmus Villemoes wrote:
> [Yury, please do remember to Cc everyone who has previously
> participated]
>
> On Mon, Feb 09 2015, "George Spelvin"  wrote:
>
>> Two more comments on the code.  Two minor, but one that
>> seems like a bug, so for now, it's
>>
>> Nacked-by: George Spelvin  
>>
>> Specifically, it seems like find_last_bit used to ignore trailing
>> garbage in the bitmap, but now will stop searching if the last word
>> contains some set bits not within size.
> True, though see below.
>
>> The minor one is that I don't think the first-word masking needs to
>> be conditional.  The general code works fine if the start is aligned
>> (HIGH_BITS_MASK just generates an all-ones mask), is quite quick, and
>> saves a test & conditional branch.
>>
> I also noted that during the first review, but when I tried to compile
> it gcc actually generated slightly worse code, so I decided not to
> comment on it. I don't have a strong preference either way, though.
>
>> Previously, the last word was masked, so bits beyond "size" were ignored.
>> With the revised code, something like find_last_bit(array, 96) will return 96
>> if array[1] >> 32 is non-zero, even if array[1] & 0x is zero.
>>
>> Looking through the callers, I haven't found a case where this matters yet
>> so perhaps it's a safe optimization, but this really needs to be more
>> clearly documented if intentional.
>>
>> If no change was desired, I'd think a good way to do this would be:
>>
>>  unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
>>  {
>>  size_t idx = DIV_ROUND_UP(size, BITS_PER_LONG);
>>  unsigned long tmp = addr[--idx];
>>
>>  tmp &= (2UL << (size % BITS_PER_LONG)) - 1; /* Mask last word */
>>
>>  while (!tmp) {
>>  if (!idx)
>>  return size;
>>  tmp = addr[--idx];
>>  }
>>  return idx * BITS_PER_LONG + __fls(tmp);
>> }
> How should that work? If size is for example 1, the mask evaluates to 3UL,
> while what is needed is 1UL. If size is aligned, the mask becomes 1UL,
> which is also not right.
>
> Also, I think it is best to handle size==0 appropriately, meaning that
> one cannot dereference addr in any way (and certainly not addr[-1]).
>
> So how about
>
> unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
> {
>   size_t idx = DIV_ROUND_UP(size, BITS_PER_LONG);
>   unsigned long mask = LAST_WORD_MASK(size);
>
>   while (idx--) {
>   unsigned long val = addr[idx] & mask;
>   if (val)
>   return idx * BITS_PER_LONG + __fls(val);
>   mask = ~0ul;
>   }
>   return size;
> }
>
> Rasmus
Rasmus, your version ANDs with the mask and resets the mask at each
iteration of the main loop. I think we can avoid that. What do you think
of the following?

/*
 * find_last_bit - find the index of the last (most significant) set bit
 * @addr: the bitmap to search
 * @size: number of valid bits in the bitmap
 *
 * Returns the bit number of the last set bit, or @size if no set bit is
 * found (including when @size is 0).
 *
 * NOTE(review): relies on DIV_ROUND_UP, BITS_PER_LONG, LAST_WORD_MASK and
 * __fls being defined elsewhere; ignoring bits beyond @size in the final
 * word depends on LAST_WORD_MASK masking them off -- confirm against its
 * definition.
 */
unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
{
size_t idx;
unsigned long tmp;

/* Empty bitmap: nothing to find, and addr must not be dereferenced. */
if (!size)
return 0;

/* Start at the final word, masking off bits past @size in it. */
idx = DIV_ROUND_UP(size, BITS_PER_LONG) - 1;
tmp = addr[idx] & LAST_WORD_MASK(size);

/* Walk backwards to the first (highest) non-zero word. */
while (!tmp) {
if (!idx--)
return size; /* every word was zero: not found */
tmp = addr[idx];
}
/* __fls(tmp) is the most significant set bit within that word. */
return idx * BITS_PER_LONG + __fls(tmp);
}

Yury
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH V3] lib/bitmap.c: rewrite __bitmap_parse && __bitmap_parselist

2015-07-01 Thread Yury

> Subject: lib/bitmap.c: rewrite __bitmap_parse && __bitmap_parselist
scripts/checkpatch.pl 
lib_bitmap.c:-rewrite-__bitmap_parse-__bitmap_parselist.patch

total: 134 errors, 1 warnings, 284 lines checked

NOTE: whitespace errors detected, you may wish to use 
scripts/cleanpatch or

  scripts/cleanfile

Most of them are about DOS line endings, but they prevent me from
applying your patch:
patch -p1 < 
lib_bitmap.c:-rewrite-__bitmap_parse-__bitmap_parselist.patch

(Stripping trailing CRs from patch; use --binary to disable.)
patching file lib/bitmap.c
Hunk #1 FAILED at 16.
Hunk #2 FAILED at 331.
Hunk #3 FAILED at 359.
Hunk #4 FAILED at 417.
Hunk #5 FAILED at 503.
5 out of 5 hunks FAILED -- saving rejects to file lib/bitmap.c.rej
>
> add __bitmap_parse_common to match any contents and return expected 
reslut.

>
> as __bitmap_parse_common need NULL-terminated string, we alloc a new buf.
>
> this patch also fix an unexpected parse result issue in 
__bitmap_parselist.

>
> Signed-off-by: Pan Xinhui 
> ---
>   lib/bitmap.c | 238 
+--

>   1 file changed, 134 insertions(+), 104 deletions(-)
> ---
> change log
> v3:
> __bitmap_parselist now allow some extra input, like ",2, 3 5-8", 
at least one digit inside.

> input like " " ", " is still not allowed.
>
> V2:
> __bitmap_parse_common need *base* to parse correct result
>
> V1:
> add __bitmap_parse_common and rewrite __bitmap_parse && 
__bitmap_parselist

> ---
>
> diff --git a/lib/bitmap.c b/lib/bitmap.c
> index 64c0926..f2095e1d 100644
> --- a/lib/bitmap.c
> +++ b/lib/bitmap.c
> @@ -16,6 +16,8 @@
>   #include 
>   #include 
>
> +#include 
> +#include 
>   /*
>* bitmaps provide an array of bits, implemented using an an
>* array of unsigned longs.  The number of valid bits in a
> @@ -331,6 +333,58 @@ again:
>   EXPORT_SYMBOL(bitmap_find_next_zero_area_off);
>
>   /*
> + * __bitmap_parse_common - parse expected number from buf
> + * Return 0 on success.
> + * there two patterns.
> + * if buf's contents did not match any of them, reutrn equivalent error.
> + * Notice buf's contents may be changed.
> + */
> +static int __bitmap_parse_common(char *buf, unsigned int buflen,
> + unsigned long *a, unsigned long *b, int base)

It looks weird, and there is a lot I don't like in your version:
The name is bad.
There's nothing about bitmaps in it. You are just parsing a string for
two patterns: %d, and %d-%d.


You do your work twice (at least): first you detect digits in 
match_token, then - in kstrtoul.

You allocate kbuf unconditionally, no matter you need it or not.
You do more than one thing in __bitmap_parse_common (you search number 
and region).

You modify initial string.

Let's consider a more straight interface:

/*
 * I don't know why this function is not written yet.
 * Maybe it's something ideological...
 */
void set_bits(unsigned long *bitmap, unsigned long start, unsigned 
long len);


/*
 * Takes care of all user whitespaces and commas,
 * Return endp, or error if parse fails, or null if string reached 
the end.

 */
char *parse_range(const char *buf, unsigned long *start, unsigned 
long *len);


than pattern usage would be:

while (str = parse_range(str, &start, &len)) {
if (IS_ERROR(str))
return ...;
if (start + len >= nbits)
return ...;

set_bits(bitmap, start, len);
}

> +{
> +int ret;
> +int token;
> +const match_table_t table = {
> +{
> +.token = 1,
> +.pattern = "%x",
> +},
> +{
> +.token = 2,
> +.pattern = "%x-%x",
> +},
> +{
> +.token = 0,
> +.pattern = NULL,
> +}
> +};
> +substring_t substr[MAX_OPT_ARGS];
> +
> +if (!buflen || !a)
> +return -EINVAL;
> +token = match_token((char *)buf, table, substr);
> +switch (token) {
> +case 1:
> +*substr[0].to = '\0';
> +ret = kstrtoul(substr[0].from, base, a);
> +if (b)
> +*b = *a;
> +break;
> +case 2:
> +*substr[0].to = '\0';
> +*substr[1].to = '\0';
> +ret = kstrtoul(substr[0].from, base, a);
> +ret |= b ? kstrtoul(substr[1].from, base, b) : -EINVAL;
> +break;
> +default:
> +ret = -EINVAL;
> +break;
> +}
> +return ret;
> +}
> +
> +/*
>* Bitmap printing & parsing functions: first version by Nadia 
Yvette Chambers,

>* second version by Paul Jackson, third by Joe Korty.
>*/
> @@ -359,57 +413,45 @@ int __bitmap_parse(const char *buf, unsigned 
int buflen,

>   int is_user, unsigned long *maskp,
>   int nmaskbits)
>   {
> -int c, old_c, totaldigits, ndigits, nchunks, nbits;
> +int nchunks, nbits, ret;
> +unsigned long a;
>   u32 chunk;
>   const char __user __force *ubuf = (const char __use

Re: [PATCH 14/19] arm64: ilp32: add sys_ilp32.c and a separate table (in entry.S) to use it

2016-06-20 Thread Yury Norov
On Sat, Jun 18, 2016 at 02:54:23AM +0300, Yury Norov wrote:
> From: Andrew Pinski 
> 
> Add a separate syscall-table for ILP32, which dispatches either to native
> LP64 system call implementation or to compat-syscalls, as appropriate.
> 
> Signed-off-by: Andrew Pinski 
> Signed-off-by: Yury Norov 
> ---
>  arch/arm64/include/asm/unistd.h |  6 ++-
>  arch/arm64/kernel/Makefile  |  2 +-
>  arch/arm64/kernel/entry.S   | 29 +-
>  arch/arm64/kernel/sys_ilp32.c   | 84 
> +
>  4 files changed, 117 insertions(+), 4 deletions(-)
>  create mode 100644 arch/arm64/kernel/sys_ilp32.c
> 
> diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
> index fe9d6c1..250654c 100644
> --- a/arch/arm64/include/asm/unistd.h
> +++ b/arch/arm64/include/asm/unistd.h
> @@ -13,13 +13,17 @@
>   * You should have received a copy of the GNU General Public License
>   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
>   */
> +
> +#ifdef CONFIG_COMPAT
> +#define __ARCH_WANT_SYS_LLSEEK
> +#endif
> +
>  #ifdef CONFIG_AARCH32_EL0
>  #define __ARCH_WANT_COMPAT_SYS_GETDENTS64
>  #define __ARCH_WANT_COMPAT_STAT64
>  #define __ARCH_WANT_SYS_GETHOSTNAME
>  #define __ARCH_WANT_SYS_PAUSE
>  #define __ARCH_WANT_SYS_GETPGRP
> -#define __ARCH_WANT_SYS_LLSEEK
>  #define __ARCH_WANT_SYS_NICE
>  #define __ARCH_WANT_SYS_SIGPENDING
>  #define __ARCH_WANT_SYS_SIGPROCMASK
> diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
> index d11572d..205bc03 100644
> --- a/arch/arm64/kernel/Makefile
> +++ b/arch/arm64/kernel/Makefile
> @@ -28,7 +28,7 @@ $(obj)/%.stub.o: $(obj)/%.o FORCE
>  arm64-obj-$(CONFIG_AARCH32_EL0)  += sys32.o kuser32.o signal32.o 
> \
>  sys_compat.o entry32.o   
> \
>  ../../arm/kernel/opcodes.o 
> binfmt_elf32.o
> -arm64-obj-$(CONFIG_ARM64_ILP32)  += binfmt_ilp32.o
> +arm64-obj-$(CONFIG_ARM64_ILP32)  += binfmt_ilp32.o sys_ilp32.o
>  arm64-obj-$(CONFIG_COMPAT)   += entry32_common.o
>  arm64-obj-$(CONFIG_FUNCTION_TRACER)  += ftrace.o entry-ftrace.o
>  arm64-obj-$(CONFIG_MODULES)  += arm64ksyms.o module.o
> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> index 21a0624..b2b9725 100644
> --- a/arch/arm64/kernel/entry.S
> +++ b/arch/arm64/kernel/entry.S
> @@ -239,6 +239,23 @@ tsk  .reqx28 // current thread_info
>  
>   .text
>  
> +#ifdef CONFIG_ARM64_ILP32
> +/*
> + * AARCH64/ILP32. Zero top halves of x0-x7
> + * registers as userspace may put garbage there.
> + */
> + .macro  delouse_input_regs
> + mov w0, w0
> + mov w1, w1
> + mov w2, w2
> + mov w3, w3
> + mov w4, w4
> + mov w5, w5
> + mov w6, w6
> + mov w7, w7
> + .endm
> +#endif
> +
>  /*
>   * Exception vectors.
>   */
> @@ -501,6 +518,7 @@ el0_svc_compat:
>* AArch32 syscall handling
>*/
>   adrpstbl, compat_sys_call_table // load compat syscall table 
> pointer
> + ldr x16, [tsk, #TI_FLAGS]
>   uxtwscno, w7// syscall number in w7 (r7)
>   mov sc_nr, #__NR_compat_syscalls
>   b   el0_svc_naked
> @@ -717,15 +735,22 @@ ENDPROC(ret_from_fork)
>   .align  6
>  el0_svc:
>   adrpstbl, sys_call_table// load syscall table pointer
> + ldr x16, [tsk, #TI_FLAGS]
>   uxtwscno, w8// syscall number in w8
>   mov sc_nr, #__NR_syscalls
> +#ifdef CONFIG_ARM64_ILP32
> + adrpx17, sys_call_ilp32_table   // load ilp32 syscall table 
> pointer
> + tst x16, #_TIF_32BIT_AARCH64
> + b.eqel0_svc_naked   // We are using LP64  syscall 
> table
> + mov stbl, x17   // We are using ILP32 syscall 
> table
> + delouse_input_regs
> +#endif

It should be like this:
#ifdef CONFIG_ARM64_ILP32
tst x16, #_TIF_32BIT_AARCH64
b.eqel0_svc_naked   // We are using LP64  syscall 
table
adrpstbl, sys_call_ilp32_table  // load ilp32 syscall table 
pointer
delouse_input_regs
#endif

>  el0_svc_naked:   // compat entry point
>   stp x0, scno, [sp, #S_ORIG_X0]  // save the original x0 and 
> syscall number
>   enable_dbg_and_irq
>   ct_user_exit 1
>  
> - ldr x16, [tsk, #TI_FLAGS]   // check for syscall hooks
> - tst x16, #_TIF_SYSCALL_WORK
> + tst x16, 

[PATCH] mm: slab.h: use ilog2() in kmalloc_index()

2016-06-20 Thread Yury Norov
kmalloc_index() uses a simple, straightforward way to calculate
the bit position of the nearest-or-equal upper power of 2.
This effectively results in the generation of 24 sequences of
compare-branch instructions in the assembly output.

There is a shorter way to calculate this: fls(size - 1).

The patch removes the hard-coded calculation of the kmalloc slab and
uses ilog2() instead, which works on top of fls(). ilog2 is used
with the intention that the compiler may also optimize the constant
case at compile time if it detects one.

BUG() is moved to the beginning of the function. We keep it here to
provide behaviour identical to the previous version. It may be removed
if it is no longer required.

While we're at this, fix comment that describes return value.

Reported-by: Alexey Klimov 
Signed-off-by: Yury Norov 
Signed-off-by: Alexey Klimov 
---
 include/linux/slab.h | 41 +
 1 file changed, 9 insertions(+), 32 deletions(-)

diff --git a/include/linux/slab.h b/include/linux/slab.h
index aeb3e6d..294ef52 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -267,13 +267,16 @@ extern struct kmem_cache 
*kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1];
 /*
  * Figure out which kmalloc slab an allocation of a certain size
  * belongs to.
- * 0 = zero alloc
- * 1 =  65 .. 96 bytes
- * 2 = 129 .. 192 bytes
- * n = 2^(n-1)+1 .. 2^n
+ * 0 if zero alloc, or
+ * 1 if size is 65 .. 96 bytes, or
+ * 2 if size is 129 .. 192 bytes, or
+ * n if 2^(n - 1) < size <= 2^n
  */
 static __always_inline int kmalloc_index(size_t size)
 {
+   /* Bigger size is a bug */
+   BUG_ON(size > (1 << 26));
+
if (!size)
return 0;
 
@@ -284,34 +287,8 @@ static __always_inline int kmalloc_index(size_t size)
return 1;
if (KMALLOC_MIN_SIZE <= 64 && size > 128 && size <= 192)
return 2;
-   if (size <=  8) return 3;
-   if (size <= 16) return 4;
-   if (size <= 32) return 5;
-   if (size <= 64) return 6;
-   if (size <=128) return 7;
-   if (size <=256) return 8;
-   if (size <=512) return 9;
-   if (size <=   1024) return 10;
-   if (size <=   2 * 1024) return 11;
-   if (size <=   4 * 1024) return 12;
-   if (size <=   8 * 1024) return 13;
-   if (size <=  16 * 1024) return 14;
-   if (size <=  32 * 1024) return 15;
-   if (size <=  64 * 1024) return 16;
-   if (size <= 128 * 1024) return 17;
-   if (size <= 256 * 1024) return 18;
-   if (size <= 512 * 1024) return 19;
-   if (size <= 1024 * 1024) return 20;
-   if (size <=  2 * 1024 * 1024) return 21;
-   if (size <=  4 * 1024 * 1024) return 22;
-   if (size <=  8 * 1024 * 1024) return 23;
-   if (size <=  16 * 1024 * 1024) return 24;
-   if (size <=  32 * 1024 * 1024) return 25;
-   if (size <=  64 * 1024 * 1024) return 26;
-   BUG();
-
-   /* Will never be reached. Needed because the compiler may complain */
-   return -1;
+
+   return ilog2(size - 1) + 1;
 }
 #endif /* !CONFIG_SLOB */
 
-- 
2.7.4



[PATCH 08/27] [AARCH64] Use PTR_* in start.S

2016-06-20 Thread Yury Norov
From: Andrew Pinski 

To support ILP32 without many source changes, this changes
sysdeps/aarch64/start.S to use the PTR_* macros which were defined
earlier.

* sysdeps/aarch64/start.S: Include sysdep.h
(_start): Use PTR_REG, PTR_SIZE macros.

Signed-off-by: Yury Norov 
---
 sysdeps/aarch64/start.S | 20 +++-
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/sysdeps/aarch64/start.S b/sysdeps/aarch64/start.S
index efe2474..9198c57 100644
--- a/sysdeps/aarch64/start.S
+++ b/sysdeps/aarch64/start.S
@@ -16,6 +16,8 @@
License along with the GNU C Library.  If not, see
<http://www.gnu.org/licenses/>.  */
 
+#include 
+
 /* This is the canonical entry point, usually the first thing in the text
segment.
 
@@ -25,7 +27,7 @@
 
At this entry point, most registers' values are unspecified, except:
 
-   x0  Contains a function pointer to be registered with `atexit'.
+   x0/w0   Contains a function pointer to be registered with `atexit'.
This is how the dynamic linker arranges to have DT_FINI
functions called for shared libraries that have been loaded
before this code runs.
@@ -52,26 +54,26 @@ _start:
mov x5, x0
 
/* Load argc and a pointer to argv */
-   ldr x1, [sp, #0]
-   add x2, sp, #8
+   ldr PTR_REG (1), [sp, #0]
+   add x2, sp, #PTR_SIZE
 
/* Setup stack limit in argument register */
mov x6, sp
 
 #ifdef SHARED
 adrpx0, :got:main
-   ldr x0, [x0, #:got_lo12:main]
+   ldr PTR_REG (0), [x0, #:got_lo12:main]
 
 adrpx3, :got:__libc_csu_init
-   ldr x3, [x3, #:got_lo12:__libc_csu_init]
+   ldr PTR_REG (3), [x3, #:got_lo12:__libc_csu_init]
 
 adrpx4, :got:__libc_csu_fini
-   ldr x4, [x4, #:got_lo12:__libc_csu_fini]
+   ldr PTR_REG (4), [x4, #:got_lo12:__libc_csu_fini]
 #else
/* Set up the other arguments in registers */
-   ldr x0, =main
-   ldr x3, =__libc_csu_init
-   ldr x4, =__libc_csu_fini
+   ldr PTR_REG (0), =main
+   ldr PTR_REG (3), =__libc_csu_init
+   ldr PTR_REG (4), =__libc_csu_fini
 #endif
 
/* __libc_start_main (main, argc, argv, init, fini, rtld_fini,
-- 
2.7.4



[PATCH 11/27] [AARCH64] Syscalls for ILP32 are passed always via 64bit values.

2016-06-20 Thread Yury Norov
From: Andrew Pinski 

This patch adds support for ILP32 syscalls, sign and zero extending
where needed.  Unlike LP64, pointers are 32bit and need to be zero
extended rather than the standard sign extend that the code would do.
We take advantage of ssize_t being long rather than int for ILP32,
to get this correct.

* sysdeps/unix/sysv/linux/aarch64/sysdep.h
(INLINE_VSYSCALL): Use long long instead of long.
(INTERNAL_VSYSCALL): Likewise.
(INLINE_SYSCALL): Likewise.
(INTERNAL_SYSCALL_RAW): Likewise.
(ARGIFY): New macro.
(LOAD_ARGS_0): Use long long instead of long.
(LOAD_ARGS_1): Use long long instead of long
and use ARGIFY.
(LOAD_ARGS_2): Likewise.
(LOAD_ARGS_3): Likewise.
(LOAD_ARGS_4): Likewise.
(LOAD_ARGS_5): Likewise.
(LOAD_ARGS_6): Likewise.
(LOAD_ARGS_7): Likewise.

Signed-off-by: Yury Norov 
---
 sysdeps/unix/sysv/linux/aarch64/sysdep.h | 48 
 1 file changed, 30 insertions(+), 18 deletions(-)

diff --git a/sysdeps/unix/sysv/linux/aarch64/sysdep.h 
b/sysdeps/unix/sysv/linux/aarch64/sysdep.h
index 1ffabc2..42f89c8 100644
--- a/sysdeps/unix/sysv/linux/aarch64/sysdep.h
+++ b/sysdeps/unix/sysv/linux/aarch64/sysdep.h
@@ -161,11 +161,11 @@
call.  */
 # undef INLINE_SYSCALL
 # define INLINE_SYSCALL(name, nr, args...) \
-  ({ unsigned long _sys_result = INTERNAL_SYSCALL (name, , nr, args);  \
+  ({ unsigned long long _sys_result = INTERNAL_SYSCALL (name, , nr, args); 
\
  if (__builtin_expect (INTERNAL_SYSCALL_ERROR_P (_sys_result, ), 0))\
{   \
 __set_errno (INTERNAL_SYSCALL_ERRNO (_sys_result, ));  \
-_sys_result = (unsigned long) -1;  \
+_sys_result = (unsigned long long) -1; \
}   \
  (long) _sys_result; })
 
@@ -174,10 +174,10 @@
 
 # undef INTERNAL_SYSCALL_RAW
 # define INTERNAL_SYSCALL_RAW(name, err, nr, args...)  \
-  ({ long _sys_result; \
+  ({ long long _sys_result;\
  { \
LOAD_ARGS_##nr (args)   \
-   register long _x8 asm ("x8") = (name);  \
+   register long long _x8 asm ("x8") = (name); \
asm volatile ("svc  0   // syscall " # name \
 : "=r" (_x0) : "r"(_x8) ASM_ARGS_##nr : "memory"); \
_sys_result = _x0;  \
@@ -199,36 +199,48 @@
 # undef INTERNAL_SYSCALL_ERRNO
 # define INTERNAL_SYSCALL_ERRNO(val, err)  (-(val))
 
+/* Convert X to a long long, without losing any bits if it is one
+   already or warning if it is a 32-bit pointer.  This zero extends
+   32-bit pointers and sign extends other signed types.  Note this only
+   works because ssize_t is long and short-short is promoted to int.   */
+#define ARGIFY(X)  
\
+   ((unsigned long long)   
\
+ __builtin_choose_expr(__builtin_types_compatible_p(__typeof__(X), 
__typeof__((X) - (X))), \
+   (X),
\
+   __builtin_choose_expr(__builtin_types_compatible_p(int, 
__typeof__((X) - (X))), \
+ (X),  
\
+ (unsigned long)(X
+
 # define LOAD_ARGS_0() \
-  register long _x0 asm ("x0");
+  register long long _x0 asm ("x0");
 # define LOAD_ARGS_1(x0)   \
-  long _x0tmp = (long) (x0);   \
+  long long _x0tmp = ARGIFY (x0);  \
   LOAD_ARGS_0 ()   \
   _x0 = _x0tmp;
 # define LOAD_ARGS_2(x0, x1)   \
-  long _x1tmp = (long) (x1);   \
+  long long _x1tmp = ARGIFY (x1);  \
   LOAD_ARGS_1 (x0) \
-  register long _x1 asm ("x1") = _x1tmp;
+  register long long _x1 asm ("x1") = _x1tmp;
 # define LOAD_ARGS_3(x0, x1, x2)   \
-  long _x2tmp = (long) (x2);   \
+  long long _x2tmp = ARGIFY (x2);  \
   LOAD_ARGS_2 (x0, x1) \
-  register long _x2 asm ("x2") = _x2tmp;
+  register long long _x2 asm ("x2") = _x2tmp;
 # define LOAD_ARGS_4(x0, x1, x2, x3)   \
-  long _x3tmp = (long) (x3);   \
+  long long _x3tmp = ARGIFY (x3);  \
   LOAD_ARGS_3 (x0, x1, x2) \
-

[PATCH 10/27] [AARCH64] Detect ILP32 in configure scripts.

2016-06-20 Thread Yury Norov
From: Andrew Pinski 

This adds detection of ILP32 to the configure scripts.
Detection of ilp32 is added to preconfigure, and the default-abi is then
written out in configure.

* sysdeps/aarch64/preconfigure: Detect ILP32 and set aarch64_config_abi
to ilp32 for ilp32 and lp64 for lp64.  Set machine to either
aarch64/ilp32 or aarch64/lp64 depending on the ABI that is selected.
* sysdeps/aarch64/configure.ac: Define HAVE_AARCH64_ILP32 if this is ILP32.
Set the default-abi to either ilp32, lp64, ilp32_be or lp64_be depending
on the ABI.
* sysdeps/aarch64/configure: Regenerate.
* sysdeps/unix/sysv/linux/aarch64/configure.ac: Set arch_minimum_kernel
to 3.19.0 for ILP32.
Set LIBC_SLIBDIR_RTLDDIR to libilp32/lib for ilp32.
* sysdeps/unix/sysv/linux/aarch64/configure: Regenerate.

Signed-off-by: Yury Norov 
---
 sysdeps/aarch64/configure| 15 +++
 sysdeps/aarch64/configure.ac | 11 +--
 sysdeps/aarch64/preconfigure | 11 ++-
 sysdeps/unix/sysv/linux/aarch64/configure| 22 +++---
 sysdeps/unix/sysv/linux/aarch64/configure.ac |  9 +++--
 5 files changed, 56 insertions(+), 12 deletions(-)
 mode change 100644 => 100755 sysdeps/aarch64/configure
 mode change 100644 => 100755 sysdeps/unix/sysv/linux/aarch64/configure

diff --git a/sysdeps/aarch64/configure b/sysdeps/aarch64/configure
old mode 100644
new mode 100755
index 5bd355a..7dd56e8
--- a/sysdeps/aarch64/configure
+++ b/sysdeps/aarch64/configure
@@ -163,12 +163,19 @@ rm -f conftest*
 fi
 { $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_aarch64_be" >&5
 $as_echo "$libc_cv_aarch64_be" >&6; }
+
+if test $aarch64_config_abi = ilp32; then
+  $as_echo "#define HAVE_AARCH64_ILP32 1" >>confdefs.h
+
+fi
+
 if test $libc_cv_aarch64_be = yes; then
   $as_echo "#define HAVE_AARCH64_BE 1" >>confdefs.h
 
-  config_vars="$config_vars
-default-abi = lp64_be"
+  libc_aarch64_be=_be
 else
-  config_vars="$config_vars
-default-abi = lp64"
+  libc_aarch64_be=
 fi
+
+config_vars="$config_vars
+default-abi = ${aarch64_config_abi}${libc_aarch64_be}"
diff --git a/sysdeps/aarch64/configure.ac b/sysdeps/aarch64/configure.ac
index 7851dd4..9069325 100644
--- a/sysdeps/aarch64/configure.ac
+++ b/sysdeps/aarch64/configure.ac
@@ -14,9 +14,16 @@ AC_CACHE_CHECK([for big endian],
   yes
  #endif
   ], libc_cv_aarch64_be=yes, libc_cv_aarch64_be=no)])
+
+if test $aarch64_config_abi = ilp32; then
+  AC_DEFINE(HAVE_AARCH64_ILP32)
+fi
+
 if test $libc_cv_aarch64_be = yes; then
   AC_DEFINE(HAVE_AARCH64_BE)
-  LIBC_CONFIG_VAR([default-abi], [lp64_be])
+  libc_aarch64_be=_be
 else
-  LIBC_CONFIG_VAR([default-abi], [lp64])
+  libc_aarch64_be=
 fi
+
+LIBC_CONFIG_VAR([default-abi], [${aarch64_config_abi}${libc_aarch64_be}])
diff --git a/sysdeps/aarch64/preconfigure b/sysdeps/aarch64/preconfigure
index d9bd1f8..4bcd8e3 100644
--- a/sysdeps/aarch64/preconfigure
+++ b/sysdeps/aarch64/preconfigure
@@ -1,6 +1,15 @@
 case "$machine" in
 aarch64*)
base_machine=aarch64
-   machine=aarch64
+   case "$CC $CFLAGS $CPPFLAGS " in
+   *" -mabi=ilp32 "*) aarch64_config_abi=ilp32 ;;
+   *" -mabi=lp64 "*) aarch64_config_abi=lp64 ;;
+   *) aarch64_config_abi=default ;;
+   esac
+   case $aarch64_config_abi in
+   default) machine=aarch64/lp64 aarch64_config_abi=lp64 ;;
+   ilp32) machine=aarch64/ilp32 ;;
+   lp64) machine=aarch64/lp64 ;;
+   esac
;;
 esac
diff --git a/sysdeps/unix/sysv/linux/aarch64/configure 
b/sysdeps/unix/sysv/linux/aarch64/configure
old mode 100644
new mode 100755
index f48472c..2563e83
--- a/sysdeps/unix/sysv/linux/aarch64/configure
+++ b/sysdeps/unix/sysv/linux/aarch64/configure
@@ -1,9 +1,23 @@
 # This file is generated from configure.ac by Autoconf.  DO NOT EDIT!
  # Local configure fragment for sysdeps/unix/sysv/linux/aarch64.
 
-arch_minimum_kernel=3.7.0
-
-test -n "$libc_cv_slibdir" ||
+if test $aarch64_config_abi = ilp32; then
+  arch_minimum_kernel=3.19.0
+  test -n "$libc_cv_slibdir" ||
+case "$prefix" in
+/usr | /usr/)
+  libc_cv_slibdir=/libilp32
+  libc_cv_rtlddir=/lib
+  if test "$libdir" = '${exec_prefix}/lib'; then
+libdir='${exec_prefix}/libilp32';
+# Locale data can be shared between 32-bit and 64-bit libraries.
+libc_cv_localedir='${exec_prefix}/lib/locale'
+  fi
+  ;;
+esac
+else
+  arch_minimum_kernel=3.7.0
+  test -n "$libc_cv_slibdir" ||
 case "$prefix" in
 /usr | /usr/)
   libc_cv_slibdir=/lib64
@@ -15,3 +29,5 @@ case "$prefix" in
   fi
   ;;
 esac
+fi
+
diff --git a/sysdeps/unix/sysv/linux/aarch64/configure.ac 
b/sysdeps/unix/sysv/linux/aarch64/configure.ac
index 21

[PATCH 05/27] [AARCH64] Use PTR_REG in crti.S.

2016-06-20 Thread Yury Norov
From: Andrew Pinski 

call_weak_fn loads from a pointer, so use PTR_REG so the load
is 32bits for ILP32.

* sysdeps/aarch64/crti.S: Include sysdep.h
(call_weak_fn): Use PTR_REG when loading from
PREINIT_FUNCTION.

AARCH64: Make RTLD_START parametrizable

Make RTLD_START parametrizable for ILP32 and LP64 usage, and provide common
code between ILP32 and LP64.

* sysdeps/aarch64/dl-machine.h (RTLD_START): Rename to ...
(RTLD_START_1): This and add PTR, PTR_SIZE_LOG, and PTR_SP arguments.
(RTLD_START): New macro which uses RTLD_START_1.

Signed-off-by: Yury Norov 
---
 sysdeps/aarch64/crti.S   |   3 +-
 sysdeps/aarch64/dl-machine.h | 128 ++-
 2 files changed, 69 insertions(+), 62 deletions(-)

diff --git a/sysdeps/aarch64/crti.S b/sysdeps/aarch64/crti.S
index 53ccb42..6e149b5 100644
--- a/sysdeps/aarch64/crti.S
+++ b/sysdeps/aarch64/crti.S
@@ -39,6 +39,7 @@
they can be called as functions.  The symbols _init and _fini are
magic and cause the linker to emit DT_INIT and DT_FINI.  */
 
+#include 
 #include 
 
 #ifndef PREINIT_FUNCTION
@@ -60,7 +61,7 @@
.type   call_weak_fn, %function
 call_weak_fn:
adrpx0, :got:PREINIT_FUNCTION
-   ldr x0, [x0, #:got_lo12:PREINIT_FUNCTION]
+   ldr PTR_REG(0), [x0, #:got_lo12:PREINIT_FUNCTION]
cbz x0, 1f
b   PREINIT_FUNCTION
 1:
diff --git a/sysdeps/aarch64/dl-machine.h b/sysdeps/aarch64/dl-machine.h
index a16cb11..7a49852 100644
--- a/sysdeps/aarch64/dl-machine.h
+++ b/sysdeps/aarch64/dl-machine.h
@@ -126,68 +126,74 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, 
int profile)
 /* Initial entry point for the dynamic linker. The C function
_dl_start is the real entry point, its return value is the user
program's entry point */
+#ifdef __LP64__
+#define RTLD_START RTLD_START_1("x", "3", "sp")
+#else
+#define RTLD_START RTLD_START_1("w", "2", "wsp")
+#endif
+
 
-#define RTLD_START asm ("\
-.text  \n\
-.globl _start  \n\
-.type _start, %function\n\
-.globl _dl_start_user  \n\
-.type _dl_start_user, %function\n\
-_start:\n\
-   mov x0, sp  \n\
-   bl  _dl_start   \n\
-   // returns user entry point in x0   \n\
-   mov x21, x0 \n\
-_dl_start_user:\n\
-   // get the original arg count   \n\
-   ldr x1, [sp]\n\
-   // get the argv address \n\
-   add x2, sp, #8  \n\
-   // get _dl_skip_args to see if we were  \n\
-   // invoked as an executable \n\
-   adrpx4, _dl_skip_args   \n\
-ldrw4, [x4, #:lo12:_dl_skip_args]  \n\
-   // do we need to adjust argc/argv   \n\
-cmpw4, 0   \n\
-   beq .L_done_stack_adjust\n\
-   // subtract _dl_skip_args from original arg count   \n\
-   sub x1, x1, x4  \n\
-   // store adjusted argc back to stack\n\
-   str x1, [sp]\n\
-   // find the first unskipped argument\n\
-   mov x3, x2  \n\
-   add x4, x2, x4, lsl #3  \n\
-   // shuffle argv down\n\
-1: ldr x5, [x4], #8\n\
-   str x5, [x3], #8\n\
-   cmp x5, #0  \n\
-   bne 1b  \n\
-   // shuffle envp down\n\
-1: ldr x5, [x4], #8\n\
-   str x5, [x3], #8\n\
-   cmp x5, #0  \n\
-   bne 1b  \n\
-   // shuffle auxv down\n\
-1: ldp x0, x5, [x4, #16]!  \n\
-   stp x0, x5, [x3], #16   \n\
-   cmp x0, #0  

[PATCH 06/27] [AARCH64] Use PTR_REG/PTR_SIZE/PTR_SIZE_LOG in dl-tlsdesc.S

2016-06-20 Thread Yury Norov
From: Andrew Pinski 

This converts dl-tlsdesc.S code over to use the new macros which allows for
sharing between ILP32 and LP64 code.

* sysdeps/aarch64/dl-tlsdesc.S (_dl_tlsdesc_return): Use PTR_REG.
(_dl_tlsdesc_undefweak): Use PTR_REG, PTR_SIZE.
(_dl_tlsdesc_dynamic): Likewise.
(_dl_tlsdesc_resolve_rela): Likewise.
(_dl_tlsdesc_resolve_hold): Likewise.

Signed-off-by: Yury Norov 
---
 sysdeps/aarch64/dl-tlsdesc.S | 36 ++--
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/sysdeps/aarch64/dl-tlsdesc.S b/sysdeps/aarch64/dl-tlsdesc.S
index 05be370..fe8a17d 100644
--- a/sysdeps/aarch64/dl-tlsdesc.S
+++ b/sysdeps/aarch64/dl-tlsdesc.S
@@ -74,7 +74,7 @@
cfi_startproc
.align 2
 _dl_tlsdesc_return:
-   ldr x0, [x0, #8]
+   ldr PTR_REG (0), [x0, #PTR_SIZE]
RET
cfi_endproc
.size   _dl_tlsdesc_return, .-_dl_tlsdesc_return
@@ -126,9 +126,9 @@ _dl_tlsdesc_undefweak:
   td->entry in _dl_tlsdesc_resolve_rela_fixup ensuring that the load
   from [x0,#8] here happens after the initialization of td->arg.  */
ldarxzr, [x0]
-   ldr x0, [x0, #8]
+   ldr PTR_REG (0), [x0, #PTR_SIZE]
mrs x1, tpidr_el0
-   sub x0, x0, x1
+   sub PTR_REG (0), PTR_REG (0), PTR_REG (1)
ldr x1, [sp], #16
cfi_adjust_cfa_offset (-16)
RET
@@ -189,20 +189,20 @@ _dl_tlsdesc_dynamic:
   td->entry in _dl_tlsdesc_resolve_rela_fixup ensuring that the load
   from [x0,#8] here happens after the initialization of td->arg.  */
ldarxzr, [x0]
-   ldr x1, [x0,#8]
-   ldr x0, [x4]
-   ldr x3, [x1,#16]
-   ldr x2, [x0]
-   cmp x3, x2
+   ldr PTR_REG (1), [x0,#PTR_SIZE]
+   ldr PTR_REG (0), [x4]
+   ldr PTR_REG (3), [x1,#(PTR_SIZE * 2)]
+   ldr PTR_REG (2), [x0]
+   cmp PTR_REG (3), PTR_REG (2)
b.hi2f
-   ldr x2, [x1]
-   add x0, x0, x2, lsl #4
-   ldr x0, [x0]
+   ldr PTR_REG (2), [x1]
+   add PTR_REG (0), PTR_REG (0), PTR_REG (2), lsl #(PTR_LOG_SIZE + 1)
+   ldr PTR_REG (0), [x0]
cmn x0, #0x1
b.eq2f
-   ldr x1, [x1,#8]
-   add x0, x0, x1
-   sub x0, x0, x4
+   ldr PTR_REG (1), [x1,#(PTR_SIZE * 2)]
+   add PTR_REG (0), PTR_REG (0), PTR_REG (1)
+   sub PTR_REG (0), PTR_REG (0), PTR_REG (4)
 1:
ldp  x1,  x2, [sp, #32+16*0]
ldp  x3,  x4, [sp, #32+16*1]
@@ -233,7 +233,7 @@ _dl_tlsdesc_dynamic:
bl  __tls_get_addr
 
mrs x1, tpidr_el0
-   sub x0, x0, x1
+   sub PTR_REG (0), PTR_REG (0), PTR_REG (1)
 
RESTORE_Q_REGISTERS
 
@@ -279,13 +279,13 @@ _dl_tlsdesc_resolve_rela:
 
SAVE_Q_REGISTERS
 
-   ldr x1, [x3, #8]
+   ldr PTR_REG (1), [x3, #PTR_SIZE]
bl  _dl_tlsdesc_resolve_rela_fixup
 
RESTORE_Q_REGISTERS
 
ldr x0, [sp, #32+16*8]
-   ldr x1, [x0]
+   ldr PTR_REG (1), [x0]
blr x1
 
ldp  x1,  x4, [sp, #32+16*0]
@@ -346,7 +346,7 @@ _dl_tlsdesc_resolve_hold:
RESTORE_Q_REGISTERS
 
ldr x0, [sp, #32+16*9]
-   ldr x1, [x0]
+   ldr PTR_REG (1), [x0]
blr x1
 
ldp  x1,  x2, [sp, #32+16*0]
-- 
2.7.4



[PATCH 09/27] [AARCH64] Use PTR_REG in getcontext.S.

2016-06-20 Thread Yury Norov
From: Andrew Pinski 

Just like the other patches, this patch allows getcontext.S to be shared
between ILP32 and LP64.

* sysdeps/unix/sysv/linux/aarch64/getcontext.S: Use PTR_REG when
doing an add so wrapping of the pointer is correct for ILP32.

Signed-off-by: Yury Norov 
---
 sysdeps/unix/sysv/linux/aarch64/getcontext.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sysdeps/unix/sysv/linux/aarch64/getcontext.S 
b/sysdeps/unix/sysv/linux/aarch64/getcontext.S
index c2dd5b8..71e526c 100644
--- a/sysdeps/unix/sysv/linux/aarch64/getcontext.S
+++ b/sysdeps/unix/sysv/linux/aarch64/getcontext.S
@@ -90,7 +90,7 @@ ENTRY(__getcontext)
 
/* Grab the signal mask */
/* rt_sigprocmask (SIG_BLOCK, NULL, &ucp->uc_sigmask, _NSIG8) */
-   add x2, x0, #UCONTEXT_SIGMASK
+   add PTR_REG (2), PTR_REG (0), #UCONTEXT_SIGMASK
mov x0, SIG_BLOCK
mov x1, 0
mov x3, _NSIG8
-- 
2.7.4



[PATCH 27/27] Fix PTRDIFF_MIN/PTRDIFF_MAX for ILP32.

2016-06-20 Thread Yury Norov
From: Andrew Pinski 

Signed-off-by: Andrew Pinski 
Signed-off-by: Yury Norov 
---
 sysdeps/aarch64/bits/wordsize.h | 8 +++-
 sysdeps/generic/stdint.h| 9 +++--
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/sysdeps/aarch64/bits/wordsize.h b/sysdeps/aarch64/bits/wordsize.h
index 3d5a79d..18697e2 100644
--- a/sysdeps/aarch64/bits/wordsize.h
+++ b/sysdeps/aarch64/bits/wordsize.h
@@ -19,12 +19,10 @@
 # define __WORDSIZE64
 #else
 # define __WORDSIZE32
+# define __WORDSIZE32_SIZE_ULONG   1
+# define __WORDSIZE32_PTRDIFF_LONG 1
 #endif
 
-/* LP64 and ILP32s ABI uses a 64bit time_t.
-   This allows aarch32 and AARCH64 applications
+/* This allows ILP32 and AARCH64 applications
both access utmp. */
 #define __WORDSIZE_TIME64_COMPAT32 1
-
-/* LP64 and ILP32 use the 64bit system call interface. */
-#define __SYSCALL_WORDSIZE 64
diff --git a/sysdeps/generic/stdint.h b/sysdeps/generic/stdint.h
index 4427627..05f895a 100644
--- a/sysdeps/generic/stdint.h
+++ b/sysdeps/generic/stdint.h
@@ -248,8 +248,13 @@ typedef unsigned long long int uintmax_t;
 #  define PTRDIFF_MIN  (-9223372036854775807L-1)
 #  define PTRDIFF_MAX  (9223372036854775807L)
 # else
-#  define PTRDIFF_MIN  (-2147483647-1)
-#  define PTRDIFF_MAX  (2147483647)
+#  ifdef __WORDSIZE32_PTRDIFF_LONG
+#define PTRDIFF_MIN(-2147483647L-1)
+#define PTRDIFF_MAX(2147483647L)
+#  else
+#define PTRDIFF_MIN(-2147483647-1)
+#define PTRDIFF_MAX(2147483647)
+#  endif
 # endif
 
 /* Limits of `sig_atomic_t'.  */
-- 
2.7.4



[PATCH 26/27] [AARCH64] Change type of __align to long long

2016-06-20 Thread Yury Norov
From: Andrew Pinski 

So that ILP32 is aligned to 64bits.

Signed-off-by: Yury Norov 
---
 sysdeps/aarch64/nptl/bits/semaphore.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sysdeps/aarch64/nptl/bits/semaphore.h 
b/sysdeps/aarch64/nptl/bits/semaphore.h
index 3cc5b37..3fe6047 100644
--- a/sysdeps/aarch64/nptl/bits/semaphore.h
+++ b/sysdeps/aarch64/nptl/bits/semaphore.h
@@ -31,5 +31,5 @@
 typedef union
 {
   char __size[__SIZEOF_SEM_T];
-  long int __align;
+  long long int __align;
 } sem_t;
-- 
2.7.4



[RFC PATCH 00/27] ARM64: support ILP32

2016-06-20 Thread Yury Norov
This series enables aarch64 port with ilp32 mode.

After long discussions in kernel list, we finally got
consensus on how ABI should look. This patchset adds
support for the ABI in GLIBC. It is tested with LTP
with no big regressions compared to LP64 and AARCH32.

Though it's very raw. Please be patient reviewing it.

ABI details:
 - types are taken from AARCH32; the following types are changed to 64-bit,
   as modern requirements for new APIs dictate:
ino_t  is  u64 type
off_t  is  s64 type
blkcnt_t   is  s64 type
fsblkcnt_t is  u64 type
fsfilcnt_t is  u64 type
 - 64-bit arguments are passed in syscall as register pair,
   as kernel internally clears top halves for all input regs;
 - standard syscall table is used;
 - 32-bit time_t is used. AARCH64/ILP32 is waiting for general
   fix of Y2038 problem just like other 32-bit arches;
 - stat{64}, statfs{64} structures are of the identical layout
   with LP64. Corresponding syscalls are taken from 64-bit code.

Links:
This series: https://github.com/norov/glibc/commits/ilp32-2.23
Kernel series: https://github.com/norov/linux/commits/ilp32-nowrap
Kernel in LKML: https://lkml.org/lkml/2016/6/17/990

Please review it. Any comments appreciated.

Yury.

Andrew Pinski (24):
  [AARCH64] Fix utmp struct for compatibility reasons.
  [AARCH64] Add header guards to sysdep.h headers.
  Add dynamic ILP32 AARCH64 relocations to elf.h
  [AARCH64] Add PTR_REG, PTR_LOG_SIZE, and PTR_SIZE.  Use it in
LDST_PCREL and LDST_GLOBAL.
  [AARCH64] Use PTR_REG in crti.S.
  [AARCH64] Use PTR_REG/PTR_SIZE/PTR_SIZE_LOG in dl-tlsdesc.S
  [AARCH64] Use PTR_* macros in dl-trampoline.S
  [AARCH64] Use PTR_* in start.S
  [AARCH64] Use PTR_REG in getcontext.S.
  [AARCH64] Detect ILP32 in configure scripts.
  [AARCH64] Syscalls for ILP32 are passed always via 64bit values.
  [AARCH64] Add ILP32 support to elf_machine_load_address.
  [AARCH64] Set up wordsize for ILP32.
  [AARCH64] Add ILP32 to makefiles
  [AARCH64] Add support to ldconfig for ILP32 and libilp32
  [AARCH64] Add ILP32 ld.so to the known interpreter names.
  [AARCH64] Add ldd-rewrite.sed so that ilp32 ld.so can be found
  [AARCH64] Add kernel_sigaction.h for AARCH64 ILP32
  [AARCH64] Add typesizes.h for ILP32
  [AARCH64] Make lp64 and ilp32 directories.
  Add support for AT_ARM64_MIDR.
  [AARCH64] Fix ILP32 warning
  [AARCH64] Change type of __align to long long
  Fix PTRDIFF_MIN/PTRDIFF_MAX for ILP32.

Yury Norov (3):
  [AARCH64] ILP32: introduce syscalls that pass off_t
  [AARCH64] ILP32: support stat syscall family
  [AARCH64] delouse input arguments in system functions

 elf/cache.c|   2 +
 elf/dl-sysdep.c|   1 +
 elf/elf.h  |   3 +
 sysdeps/aarch64/Implies|   6 -
 sysdeps/aarch64/__longjmp.S|   6 +-
 sysdeps/aarch64/bits/wordsize.h|  28 +++
 sysdeps/aarch64/configure  |  15 +-
 sysdeps/aarch64/configure.ac   |  11 +-
 sysdeps/aarch64/crti.S |   3 +-
 sysdeps/aarch64/dl-irel.h  |   3 +-
 sysdeps/aarch64/dl-machine.h   | 199 -
 sysdeps/aarch64/dl-tlsdesc.S   |  42 +++--
 sysdeps/aarch64/dl-trampoline.S|  18 +-
 sysdeps/aarch64/ilp32/Implies  |   6 +
 sysdeps/aarch64/jmpbuf-unwind.h|   2 +-
 sysdeps/aarch64/lp64/Implies   |   7 +
 sysdeps/aarch64/memcmp.S   |   3 +
 sysdeps/aarch64/memcpy.S   |   4 +-
 sysdeps/aarch64/memmove.S  |   3 +
 sysdeps/aarch64/memset.S   |   3 +-
 sysdeps/aarch64/nptl/bits/semaphore.h  |   2 +-
 sysdeps/aarch64/preconfigure   |  11 +-
 sysdeps/aarch64/setjmp.S   |   5 +-
 sysdeps/aarch64/start.S|  20 ++-
 sysdeps/aarch64/strchr.S   |   1 +
 sysdeps/aarch64/strchrnul.S|   1 +
 sysdeps/aarch64/strcmp.S   |   2 +
 sysdeps/aarch64/strcpy.S   |   2 +
 sysdeps/aarch64/strlen.S   |   2 +
 sysdeps/aarch64/strncmp.S  |   3 +
 sysdeps/aarch64/strnlen.S  |   3 +
 sysdeps/aarch64/strrchr.S  |   1 +
 sysdeps/aarch64/sysdep.h   |  39 +++-
 sysdeps/generic/ldconfig.h |   1 +
 sysdeps/generic/stdint.h   |   9 +-
 sysdeps/unix/sysv/linux/aarch64/Implies|   2 -
 sysdeps/unix/sysv/linux/aarch64/Makefile   |  16 +-
 sysdeps/unix/sysv/linux/aarch64/bits/fcntl.h   |   6 +-
 sysdeps/unix/sysv/

[PATCH 13/27] [AARCH64] Set up wordsize for ILP32.

2016-06-20 Thread Yury Norov
From: Andrew Pinski 

__WORDSIZE needs to be set to 32 for ILP32.

* sysdeps/aarch64/bits/wordsize.h (__WORDSIZE): Set to 32 for ILP32.
Update comments.

Signed-off-by: Yury Norov 
---
 sysdeps/aarch64/bits/wordsize.h | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/sysdeps/aarch64/bits/wordsize.h b/sysdeps/aarch64/bits/wordsize.h
index 3ecccaa..3d5a79d 100644
--- a/sysdeps/aarch64/bits/wordsize.h
+++ b/sysdeps/aarch64/bits/wordsize.h
@@ -15,12 +15,16 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>.  */
 
-#define __WORDSIZE 64
+#ifdef __LP64__
+# define __WORDSIZE64
+#else
+# define __WORDSIZE32
+#endif
 
-/* LP64 ABI has a 64bit time_t.
+/* LP64 and ILP32s ABI uses a 64bit time_t.
This allows aarch32 and AARCH64 applications
both access utmp. */
 #define __WORDSIZE_TIME64_COMPAT32 1
 
-/* LP64 use the 64bit system call interface. */
+/* LP64 and ILP32 use the 64bit system call interface. */
 #define __SYSCALL_WORDSIZE 64
-- 
2.7.4



[PATCH 21/27] [AARCH64] ILP32: introduce syscalls that pass off_t

2016-06-20 Thread Yury Norov
From: Yury Norov 

ILP32 has a 64-bit off_t, to follow modern requirements.
But the kernel clears the top halves of input registers, which means
we have to pass the corresponding arguments as a register pair, like
aarch32 does. In this patch all affected syscalls are redefined.
Most of them are taken from arm code.

Signed-off-by: Yury Norov 
---
 sysdeps/unix/sysv/linux/aarch64/ilp32/fallocate.c  | 31 +++
 .../unix/sysv/linux/aarch64/ilp32/fallocate64.c|  1 +
 sysdeps/unix/sysv/linux/aarch64/ilp32/ftruncate.c  |  1 +
 .../unix/sysv/linux/aarch64/ilp32/ftruncate64.c|  4 +++
 sysdeps/unix/sysv/linux/aarch64/ilp32/llseek.c |  1 +
 sysdeps/unix/sysv/linux/aarch64/ilp32/lseek.c  | 36 ++
 sysdeps/unix/sysv/linux/aarch64/ilp32/mmap.c   |  1 +
 .../unix/sysv/linux/aarch64/ilp32/posix_fadvise.c  |  1 +
 .../sysv/linux/aarch64/ilp32/posix_fadvise64.c |  2 ++
 sysdeps/unix/sysv/linux/aarch64/ilp32/pread.c  |  1 +
 sysdeps/unix/sysv/linux/aarch64/ilp32/pread64.c|  5 +++
 sysdeps/unix/sysv/linux/aarch64/ilp32/pwrite.c |  1 +
 sysdeps/unix/sysv/linux/aarch64/ilp32/pwrite64.c   |  5 +++
 sysdeps/unix/sysv/linux/aarch64/ilp32/readahead.c  |  1 +
 sysdeps/unix/sysv/linux/aarch64/ilp32/truncate.c   |  1 +
 sysdeps/unix/sysv/linux/aarch64/ilp32/truncate64.c |  4 +++
 16 files changed, 96 insertions(+)
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/fallocate.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/fallocate64.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/ftruncate.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/ftruncate64.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/llseek.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/lseek.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/mmap.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/posix_fadvise.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/posix_fadvise64.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/pread.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/pread64.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/pwrite.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/pwrite64.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/readahead.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/truncate.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/truncate64.c

diff --git a/sysdeps/unix/sysv/linux/aarch64/ilp32/fallocate.c 
b/sysdeps/unix/sysv/linux/aarch64/ilp32/fallocate.c
new file mode 100644
index 000..4951d06
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/ilp32/fallocate.c
@@ -0,0 +1,31 @@
+/* Copyright (C) 2007-2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include 
+#include 
+#include 
+
+
+/* Reserve storage for the data of the file associated with FD.  */
+int
+fallocate (int fd, int mode, __off_t offset, __off_t len)
+{
+  return SYSCALL_CANCEL (fallocate, fd, mode,
+__LONG_LONG_PAIR (offset >> 32, offset),
+__LONG_LONG_PAIR (len >> 32, len));
+}
+weak_alias (fallocate, fallocate64)
diff --git a/sysdeps/unix/sysv/linux/aarch64/ilp32/fallocate64.c 
b/sysdeps/unix/sysv/linux/aarch64/ilp32/fallocate64.c
new file mode 100644
index 000..f27735a
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/ilp32/fallocate64.c
@@ -0,0 +1 @@
+/* See sysdeps/unix/sysv/linux/aarch64/ilp32/fallocate.c */
diff --git a/sysdeps/unix/sysv/linux/aarch64/ilp32/ftruncate.c 
b/sysdeps/unix/sysv/linux/aarch64/ilp32/ftruncate.c
new file mode 100644
index 000..fb5b598
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/ilp32/ftruncate.c
@@ -0,0 +1 @@
+/* See sysdeps/unix/sysv/linux/aarch64/ilp32/ftruncate64.c */
diff --git a/sysdeps/unix/sysv/linux/aarch64/ilp32/ftruncate64.c 
b/sysdeps/unix/sysv/linux/aarch64/ilp32/ftruncate64.c
new file mode 100644
index 000..bd0f5fe
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/ilp32/ftruncate64.c
@@ -0,0 +1,4 @@
+#include 
+
+weak_alias (__ftruncate64, __ftruncate)
+weak_alias (__ftruncate64, ftruncate)
diff --git a/sysdeps/unix/sysv/linux/aarch64/ilp32/llseek.c 
b/sysdeps/unix/sysv/linux/aarch64/ilp32/llseek.c
ne

[PATCH 23/27] [AARCH64] delouse input arguments in system functions

2016-06-20 Thread Yury Norov
Signed-off-by: Yury Norov 
---
 sysdeps/aarch64/__longjmp.S   | 2 ++
 sysdeps/aarch64/dl-tlsdesc.S  | 6 ++
 sysdeps/aarch64/memcmp.S  | 3 +++
 sysdeps/aarch64/memcpy.S  | 4 +++-
 sysdeps/aarch64/memmove.S | 3 +++
 sysdeps/aarch64/memset.S  | 3 ++-
 sysdeps/aarch64/setjmp.S  | 1 +
 sysdeps/aarch64/strchr.S  | 1 +
 sysdeps/aarch64/strchrnul.S   | 1 +
 sysdeps/aarch64/strcmp.S  | 2 ++
 sysdeps/aarch64/strcpy.S  | 2 ++
 sysdeps/aarch64/strlen.S  | 2 ++
 sysdeps/aarch64/strncmp.S | 3 +++
 sysdeps/aarch64/strnlen.S | 3 +++
 sysdeps/aarch64/strrchr.S | 1 +
 sysdeps/aarch64/sysdep.h  | 4 +++-
 sysdeps/unix/sysv/linux/aarch64/clone.S   | 7 +++
 sysdeps/unix/sysv/linux/aarch64/getcontext.S  | 1 +
 sysdeps/unix/sysv/linux/aarch64/setcontext.S  | 1 +
 sysdeps/unix/sysv/linux/aarch64/swapcontext.S | 1 +
 20 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/sysdeps/aarch64/__longjmp.S b/sysdeps/aarch64/__longjmp.S
index 58332be..0377715 100644
--- a/sysdeps/aarch64/__longjmp.S
+++ b/sysdeps/aarch64/__longjmp.S
@@ -46,6 +46,8 @@ ENTRY (__longjmp)
cfi_offset(d14, JB_D14<<3)
cfi_offset(d15, JB_D15<<3)
 
+   DELOUSE(0)
+
ldp x19, x20, [x0, #JB_X19<<3]
ldp x21, x22, [x0, #JB_X21<<3]
ldp x23, x24, [x0, #JB_X23<<3]
diff --git a/sysdeps/aarch64/dl-tlsdesc.S b/sysdeps/aarch64/dl-tlsdesc.S
index fe8a17d..718dddf 100644
--- a/sysdeps/aarch64/dl-tlsdesc.S
+++ b/sysdeps/aarch64/dl-tlsdesc.S
@@ -74,6 +74,7 @@
cfi_startproc
.align 2
 _dl_tlsdesc_return:
+   DELOUSE(0)
ldr PTR_REG (0), [x0, #PTR_SIZE]
RET
cfi_endproc
@@ -126,6 +127,7 @@ _dl_tlsdesc_undefweak:
   td->entry in _dl_tlsdesc_resolve_rela_fixup ensuring that the load
   from [x0,#8] here happens after the initialization of td->arg.  */
ldarxzr, [x0]
+   DELOUSE(0)
ldr PTR_REG (0), [x0, #PTR_SIZE]
mrs x1, tpidr_el0
sub PTR_REG (0), PTR_REG (0), PTR_REG (1)
@@ -174,6 +176,7 @@ _dl_tlsdesc_dynamic:
stp x29, x30, [sp,#-(32+16*NSAVEXREGPAIRS)]!
cfi_adjust_cfa_offset (32+16*NSAVEXREGPAIRS)
mov x29, sp
+   DELOUSE(0)
 
/* Save just enough registers to support fast path, if we fall
   into slow path we will save additional registers.  */
@@ -279,12 +282,14 @@ _dl_tlsdesc_resolve_rela:
 
SAVE_Q_REGISTERS
 
+   DELOUSE(3)
ldr PTR_REG (1), [x3, #PTR_SIZE]
bl  _dl_tlsdesc_resolve_rela_fixup
 
RESTORE_Q_REGISTERS
 
ldr x0, [sp, #32+16*8]
+   DELOUSE(0)
ldr PTR_REG (1), [x0]
blr x1
 
@@ -346,6 +351,7 @@ _dl_tlsdesc_resolve_hold:
RESTORE_Q_REGISTERS
 
ldr x0, [sp, #32+16*9]
+   DELOUSE(0)
ldr PTR_REG (1), [x0]
blr x1
 
diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
index ae2d997..982aa02 100644
--- a/sysdeps/aarch64/memcmp.S
+++ b/sysdeps/aarch64/memcmp.S
@@ -47,6 +47,9 @@
 #define mask   x13
 
 ENTRY_ALIGN (memcmp, 6)
+   DELOUSE(0)
+   DELOUSE(1)
+   DELOUSE(2)
cbz limit, L(ret0)
eor tmp1, src1, src2
tst tmp1, #7
diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 442f390..e0bbbf8 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -46,7 +46,9 @@
 #include 
 
 ENTRY_ALIGN (memcpy, 6)
-
+   DELOUSE(0)
+   DELOUSE(1)
+   DELOUSE(2)
mov dst, dstin
cmp count, #64
b.geL(cpy_not_short)
diff --git a/sysdeps/aarch64/memmove.S b/sysdeps/aarch64/memmove.S
index dd91db0..3f72dea 100644
--- a/sysdeps/aarch64/memmove.S
+++ b/sysdeps/aarch64/memmove.S
@@ -46,6 +46,9 @@
 #define D_hx14
 
 ENTRY_ALIGN (memmove, 6)
+   DELOUSE(0)
+   DELOUSE(1)
+   DELOUSE(2)
 
cmp dstin, src
b.loL(downwards)
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index e49f4d6..e8eed9e 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -52,7 +52,8 @@
 #define tmp3w  w9
 
 ENTRY_ALIGN (__memset, 6)
-
+   DELOUSE(0)
+   DELOUSE(2)
mov dst, dstin  /* Preserve return value.  */
andsA_lw, val, #255
 #ifndef DONT_USE_DC
diff --git a/sysdeps/aarch64/setjmp.S b/sysdeps/aarch64/setjmp.S
index da83f19..d608660 100644
--- a/sysdeps/aarch64/setjmp.S
+++ b/sysdeps/aarch64/setjmp.S
@@ -33,6 +33,7 @@ END (_setjmp)
 libc_hidden_def (_setjmp)
 
 ENTRY (__sigsetjmp)
+   DELOUSE(0)
 
 1:
stp x19, x20, [x0, #JB_X19<<3]
diff 

[PATCH 12/27] [AARCH64] Add ILP32 support to elf_machine_load_address.

2016-06-20 Thread Yury Norov
From: Andrew Pinski 

This adds ILP32 support to elf_machine_load_address.
Since elf_machine_load_address depends on the static address being
found without relocations, we need to use 16bit relocation which gets
resolved at link time for ILP32.  This is just like how the 32bit
relocation gets resolved at link time for LP64.

* sysdeps/aarch64/dl-machine.h (elf_machine_load_address): Add support
for ILP32.

Signed-off-by: Yury Norov 
---
 sysdeps/aarch64/dl-machine.h | 18 --
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/sysdeps/aarch64/dl-machine.h b/sysdeps/aarch64/dl-machine.h
index 7a49852..771b0c6 100644
--- a/sysdeps/aarch64/dl-machine.h
+++ b/sysdeps/aarch64/dl-machine.h
@@ -54,19 +54,33 @@ elf_machine_load_address (void)
  by constructing a non GOT reference to the symbol, the dynamic
  address of the symbol we compute using adrp/add to compute the
  symbol's address relative to the PC.
- This depends on 32bit relocations being resolved at link time
- and that the static address fits in the 32bits.  */
+ This depends on 32/16bit relocations being resolved at link time
+ and that the static address fits in the 32/16 bits.  */
 
   ElfW(Addr) static_addr;
   ElfW(Addr) dynamic_addr;
 
   asm ("   \n"
 "  adrp%1, _dl_start;  \n"
+#ifdef __LP64__
 "  add %1, %1, #:lo12:_dl_start\n"
+#else
+"  add %w1, %w1, #:lo12:_dl_start  \n"
+#endif
 "  ldr %w0, 1f \n"
 "  b   2f  \n"
 "1:\n"
+#ifdef __LP64__
 "  .word   _dl_start   \n"
+#else
+# ifdef __AARCH64EB__
+"  .short  0   \n"
+# endif
+"  .short  _dl_start   \n"
+# ifndef __AARCH64EB__
+"  .short  0   \n"
+# endif
+#endif
 "2:\n"
 : "=r" (static_addr),  "=r" (dynamic_addr));
   return dynamic_addr - static_addr;
-- 
2.7.4



[PATCH 17/27] [AARCH64] Add ldd-rewrite.sed so that ilp32 ld.so can be found

2016-06-20 Thread Yury Norov
From: Andrew Pinski 

To support multi-lib with ldd, we need to add an ldd-rewrite.sed file
to rewrite RTLDLIST to include both ld.so's.

* sysdeps/unix/sysv/linux/aarch64/configure.ac (ldd_rewrite_script):
Set.
* sysdeps/unix/sysv/linux/aarch64/configure: Regenerate.
* sysdeps/unix/sysv/linux/aarch64/ldd-rewrite.sed: New file.

Signed-off-by: Yury Norov 
---
 sysdeps/unix/sysv/linux/aarch64/configure   | 2 ++
 sysdeps/unix/sysv/linux/aarch64/configure.ac| 2 ++
 sysdeps/unix/sysv/linux/aarch64/ldd-rewrite.sed | 1 +
 3 files changed, 5 insertions(+)
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ldd-rewrite.sed

diff --git a/sysdeps/unix/sysv/linux/aarch64/configure 
b/sysdeps/unix/sysv/linux/aarch64/configure
index 2563e83..325422f 100755
--- a/sysdeps/unix/sysv/linux/aarch64/configure
+++ b/sysdeps/unix/sysv/linux/aarch64/configure
@@ -31,3 +31,5 @@ case "$prefix" in
 esac
 fi
 
+ldd_rewrite_script=$dir/ldd-rewrite.sed
+
diff --git a/sysdeps/unix/sysv/linux/aarch64/configure.ac 
b/sysdeps/unix/sysv/linux/aarch64/configure.ac
index 6526816..3ca8ed1 100644
--- a/sysdeps/unix/sysv/linux/aarch64/configure.ac
+++ b/sysdeps/unix/sysv/linux/aarch64/configure.ac
@@ -9,3 +9,5 @@ else
   LIBC_SLIBDIR_RTLDDIR([lib64], [lib])
 fi
 
+ldd_rewrite_script=$dir/ldd-rewrite.sed
+
diff --git a/sysdeps/unix/sysv/linux/aarch64/ldd-rewrite.sed 
b/sysdeps/unix/sysv/linux/aarch64/ldd-rewrite.sed
new file mode 100644
index 000..2f3bbb9
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/ldd-rewrite.sed
@@ -0,0 +1 @@
+s_^\(RTLDLIST=\)\(.*lib/\)\([^/]*\)\(-aarch64\)\(\|\_be\)\(\|\_ilp32\)\(.so\.[0-9.]*\)$_\1"\2\3-aarch64\5\7
 \2\3-aarch64\5\_ilp32\7"_
-- 
2.7.4



[PATCH 19/27] [AARCH64] Add typesizes.h for ILP32

2016-06-20 Thread Yury Norov
From: Andrew Pinski 

The generic typesizes.h does not work for ILP32, as the kernel long type
needs to be long long (quad).  time_t, off_t, clock_t, suseconds_t,
ino_t, rlim_t are 64bits.
FDSET bitmask is a 64bit type.

* sysdeps/unix/sysv/linux/aarch64/bits/typesizes.h: New file.

Signed-off-by: Yury Norov 
---
 sysdeps/unix/sysv/linux/aarch64/bits/typesizes.h | 79 
 sysdeps/unix/sysv/linux/sysdep-vdso.h|  4 +-
 2 files changed, 81 insertions(+), 2 deletions(-)
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/bits/typesizes.h

diff --git a/sysdeps/unix/sysv/linux/aarch64/bits/typesizes.h 
b/sysdeps/unix/sysv/linux/aarch64/bits/typesizes.h
new file mode 100644
index 000..844da49
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/bits/typesizes.h
@@ -0,0 +1,79 @@
+/* bits/typesizes.h -- underlying types for *_t.  Linux/AArch64 version.
+   Copyright (C) 2011-2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _BITS_TYPES_H
+# error "Never include <bits/typesizes.h> directly; use <sys/types.h> instead."
+#endif
+
+#ifndef _BITS_TYPESIZES_H
+#define _BITS_TYPESIZES_H  1
+
+/* See  for the meaning of these macros.  This file exists so
+   that  need not vary across different GNU platforms.  */
+
+#define __DEV_T_TYPE   __UQUAD_TYPE
+#define __UID_T_TYPE   __U32_TYPE
+#define __GID_T_TYPE   __U32_TYPE
+#define __INO_T_TYPE   __ULONGWORD_TYPE
+#define __INO64_T_TYPE __UQUAD_TYPE
+#define __MODE_T_TYPE  __U32_TYPE
+#define __NLINK_T_TYPE __U32_TYPE
+#define __OFF_T_TYPE   __SQUAD_TYPE
+#define __OFF64_T_TYPE __SQUAD_TYPE
+#define __PID_T_TYPE   __S32_TYPE
+#define __RLIM_T_TYPE  __ULONGWORD_TYPE
+#define __RLIM64_T_TYPE__UQUAD_TYPE
+#define __BLKCNT_T_TYPE __SLONGWORD_TYPE
+#define __BLKCNT64_T_TYPE   __SQUAD_TYPE
+#define __FSBLKCNT_T_TYPE   __ULONGWORD_TYPE
+#define __FSBLKCNT64_T_TYPE __UQUAD_TYPE
+#define __FSFILCNT_T_TYPE   __ULONGWORD_TYPE
+#define __FSFILCNT64_T_TYPE __UQUAD_TYPE
+#define __FSWORD_T_TYPE __SWORD_TYPE
+#define __ID_T_TYPE __U32_TYPE
+#define __CLOCK_T_TYPE __SLONGWORD_TYPE
+#define __TIME_T_TYPE  __SLONGWORD_TYPE
+#define __USECONDS_T_TYPE  __U32_TYPE
+#define __SUSECONDS_T_TYPE __SLONGWORD_TYPE
+#define __DADDR_T_TYPE __S32_TYPE
+#define __KEY_T_TYPE   __S32_TYPE
+#define __CLOCKID_T_TYPE   __S32_TYPE
+#define __TIMER_T_TYPE void *
+#define __BLKSIZE_T_TYPE   __S32_TYPE
+#define __FSID_T_TYPE  struct { int __val[2]; }
+/* ssize_t is always signed long in both ABIs. */
+#define __SSIZE_T_TYPE __SLONGWORD_TYPE
+#define __SYSCALL_SLONG_TYPE   __SLONGWORD_TYPE
+#define __SYSCALL_ULONG_TYPE   __ULONGWORD_TYPE
+#define __CPU_MASK_TYPE __ULONGWORD_TYPE
+
+#ifdef __LP64__
+/* Tell the libc code that off_t and off64_t are actually the same type
+   for all ABI purposes, even if possibly expressed as different base types
+   for C type-checking purposes.  */
+# define __OFF_T_MATCHES_OFF64_T   1
+
+/* Same for ino_t and ino64_t.  */
+# define __INO_T_MATCHES_INO64_T   1
+#endif
+
+/* Number of descriptors that can fit in an `fd_set'.  */
+#define __FD_SETSIZE   1024
+
+
+#endif /* bits/typesizes.h */
diff --git a/sysdeps/unix/sysv/linux/sysdep-vdso.h 
b/sysdeps/unix/sysv/linux/sysdep-vdso.h
index e8c4a7b..351d6bb 100644
--- a/sysdeps/unix/sysv/linux/sysdep-vdso.h
+++ b/sysdeps/unix/sysv/linux/sysdep-vdso.h
@@ -37,7 +37,7 @@
 __label__ out;   \
 __label__ iserr; \
 INTERNAL_SYSCALL_DECL (sc_err);  \
-long int sc_ret; \
+__syscall_slong_t sc_ret;\
  \
 __typeof (__vdso_##name) vdsop = __vdso_##name;  \
   

[PATCH 20/27] [AARCH64] Make lp64 and ilp32 directories.

2016-06-20 Thread Yury Norov
From: Andrew Pinski 

This patch gives the ilp32 and lp64 ABIs their own directories under aarch64.
Since ILP32 uses most of the same system calls as LP64 and has a 64-bit
off_t, we need to make the functions ending in 64 the same as the ones without.
We also no longer need to special-case ioctl, and can use the already provided mmap.c file.

Signed-off-by: Yury Norov 
---
 sysdeps/aarch64/Implies|  6 --
 sysdeps/aarch64/ilp32/Implies  |  6 ++
 sysdeps/aarch64/lp64/Implies   |  7 ++
 sysdeps/unix/sysv/linux/aarch64/Implies|  2 -
 sysdeps/unix/sysv/linux/aarch64/bits/fcntl.h   |  6 +-
 sysdeps/unix/sysv/linux/aarch64/bits/typesizes.h   | 26 ++--
 sysdeps/unix/sysv/linux/aarch64/ilp32/Implies  |  4 ++
 .../unix/sysv/linux/aarch64/ilp32/dl-fxstatat64.c  |  6 ++
 sysdeps/unix/sysv/linux/aarch64/ilp32/dl-xstat64.c |  6 ++
 sysdeps/unix/sysv/linux/aarch64/ilp32/getdents.c   | 78 ++
 sysdeps/unix/sysv/linux/aarch64/ilp32/getdents64.c |  1 +
 sysdeps/unix/sysv/linux/aarch64/ilp32/lseek64.c|  0
 sysdeps/unix/sysv/linux/aarch64/ilp32/mmap64.c |  1 +
 sysdeps/unix/sysv/linux/aarch64/ilp32/msgctl.c | 32 +
 sysdeps/unix/sysv/linux/aarch64/ilp32/preadv.c |  1 +
 sysdeps/unix/sysv/linux/aarch64/ilp32/preadv64.c   |  5 ++
 sysdeps/unix/sysv/linux/aarch64/ilp32/pwritev.c|  1 +
 sysdeps/unix/sysv/linux/aarch64/ilp32/pwritev64.c  |  5 ++
 sysdeps/unix/sysv/linux/aarch64/ilp32/readdir64.c  |  1 +
 sysdeps/unix/sysv/linux/aarch64/ilp32/semctl.c | 53 +++
 .../unix/sysv/linux/aarch64/ilp32/shlib-versions   |  7 ++
 sysdeps/unix/sysv/linux/aarch64/ilp32/shmctl.c | 32 +
 .../unix/sysv/linux/aarch64/ilp32/syscalls.list|  0
 sysdeps/unix/sysv/linux/aarch64/ioctl.S| 31 -
 sysdeps/unix/sysv/linux/aarch64/lp64/Implies   |  4 ++
 sysdeps/unix/sysv/linux/aarch64/lp64/ioctl.S   | 31 +
 sysdeps/unix/sysv/linux/aarch64/lp64/mmap.c| 34 ++
 .../unix/sysv/linux/aarch64/lp64/shlib-versions|  7 ++
 sysdeps/unix/sysv/linux/aarch64/mmap.c | 34 --
 sysdeps/unix/sysv/linux/aarch64/shlib-versions |  7 --
 30 files changed, 346 insertions(+), 88 deletions(-)
 delete mode 100644 sysdeps/aarch64/Implies
 create mode 100644 sysdeps/aarch64/ilp32/Implies
 create mode 100644 sysdeps/aarch64/lp64/Implies
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/Implies
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/dl-fxstatat64.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/dl-xstat64.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/getdents.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/getdents64.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/lseek64.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/mmap64.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/msgctl.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/preadv.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/preadv64.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/pwritev.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/pwritev64.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/readdir64.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/semctl.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/shlib-versions
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/shmctl.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/syscalls.list
 delete mode 100644 sysdeps/unix/sysv/linux/aarch64/ioctl.S
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/lp64/Implies
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/lp64/ioctl.S
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/lp64/mmap.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/lp64/shlib-versions
 delete mode 100644 sysdeps/unix/sysv/linux/aarch64/mmap.c
 delete mode 100644 sysdeps/unix/sysv/linux/aarch64/shlib-versions

diff --git a/sysdeps/aarch64/Implies b/sysdeps/aarch64/Implies
deleted file mode 100644
index e5adf4d..000
--- a/sysdeps/aarch64/Implies
+++ /dev/null
@@ -1,6 +0,0 @@
-wordsize-64
-ieee754/ldbl-128
-ieee754/dbl-64/wordsize-64
-ieee754/dbl-64
-ieee754/flt-32
-aarch64/soft-fp
diff --git a/sysdeps/aarch64/ilp32/Implies b/sysdeps/aarch64/ilp32/Implies
new file mode 100644
index 000..705bc5b
--- /dev/null
+++ b/sysdeps/aarch64/ilp32/Implies
@@ -0,0 +1,6 @@
+aarch64
+wordsize-32
+ieee754/ldbl-128
+ieee754/dbl-64
+ieee754/flt-32
+aarch64/soft-fp
diff --git a/sysdeps/aarch64/lp64/Implies b/sysdeps/aarch64/lp64/Implies
new file mode 100644
index 000..d2fbde7
--- /dev/null
+++ b/sysdeps/aarch64/lp64/Implies
@@ -0,0 +1,7 @@
+aarch64
+wordsize-64
+ieee754/ldbl-128
+ieee754/dbl-64/wordsize-64
+ieee754/dbl-64
+ieee754/flt-32
+aarch64/soft-fp
diff --git a/sysdeps/unix/sysv/linux/aarch64/Implies 
b/sysdeps/unix/sysv/linux/aarch64

[PATCH 18/27] [AARCH64] Add kernel_sigaction.h for AARCH64 ILP32

2016-06-20 Thread Yury Norov
From: Andrew Pinski 

In ILP32, the sigaction struct is the same as on AARCH64, so we need
the header file kernel_sigaction.h.  To make this work, we use
long long fields and then add extra casts when converting
between the user-exposed struct and the kernel-exposed struct.

* sysdeps/unix/sysv/linux/aarch64/kernel_sigaction.h: New file.
* sysdeps/unix/sysv/linux/aarch64/sigaction.c (__libc_sigaction):
Add cast here it is necessary.

Signed-off-by: Yury Norov 
---
 sysdeps/unix/sysv/linux/aarch64/kernel_sigaction.h | 12 
 sysdeps/unix/sysv/linux/aarch64/sigaction.c| 10 ++
 2 files changed, 18 insertions(+), 4 deletions(-)
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/kernel_sigaction.h

diff --git a/sysdeps/unix/sysv/linux/aarch64/kernel_sigaction.h 
b/sysdeps/unix/sysv/linux/aarch64/kernel_sigaction.h
new file mode 100644
index 000..7b3023b
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/kernel_sigaction.h
@@ -0,0 +1,12 @@
+
+#define HAVE_SA_RESTORER
+
+/* This is the sigaction structure in aarch64 kernel.
+   Note the ILP32 struct uses the same struct as LP64
+   which is why the fields are 64bit in size. */
+struct kernel_sigaction {
+   unsigned long long k_sa_handler;
+   unsigned long long sa_flags;
+   unsigned long long sa_restorer;
+   sigset_t sa_mask;
+};
diff --git a/sysdeps/unix/sysv/linux/aarch64/sigaction.c 
b/sysdeps/unix/sysv/linux/aarch64/sigaction.c
index 3291924..40a327f 100644
--- a/sysdeps/unix/sysv/linux/aarch64/sigaction.c
+++ b/sysdeps/unix/sysv/linux/aarch64/sigaction.c
@@ -39,15 +39,17 @@ __libc_sigaction (int sig, const struct sigaction *act, 
struct sigaction *oact)
 
   if (act)
 {
-  kact.k_sa_handler = act->sa_handler;
+  kact.k_sa_handler = (unsigned long long)(uintptr_t)act->sa_handler;
   memcpy (&kact.sa_mask, &act->sa_mask, sizeof (sigset_t));
   kact.sa_flags = act->sa_flags;
 #ifdef HAVE_SA_RESTORER
   if (kact.sa_flags & SA_RESTORER)
-   kact.sa_restorer = act->sa_restorer;
+   kact.sa_restorer = (unsigned long long)(uintptr_t)act->sa_restorer;
 #endif
 }
 
+  /* This is needed for ILP32 as the structures are two different sizes due to
+ using the LP64 structure.  */
   result = INLINE_SYSCALL (rt_sigaction, 4, sig,
   act ? &kact : NULL,
   oact ? &koact : NULL, _NSIG / 8);
@@ -55,11 +57,11 @@ __libc_sigaction (int sig, const struct sigaction *act, 
struct sigaction *oact)
 {
   if (oact && result >= 0)
{
- oact->sa_handler = koact.k_sa_handler;
+ oact->sa_handler = (void*)(uintptr_t)koact.k_sa_handler;
  memcpy (&oact->sa_mask, &koact.sa_mask, sizeof (sigset_t));
  oact->sa_flags = koact.sa_flags;
 #ifdef HAVE_SA_RESTORER
- oact->sa_restorer = koact.sa_restorer;
+ oact->sa_restorer = (void*)(uintptr_t)koact.sa_restorer;
 #endif
}
 }
-- 
2.7.4



[PATCH 16/27] [AARCH64] Add ILP32 ld.so to the known interpreter names.

2016-06-20 Thread Yury Norov
From: Andrew Pinski 

This patch adds ILP32 ld.so names to the known interpreter names.

* sysdeps/unix/sysv/linux/aarch64/ldconfig.h (SYSDEP_KNOWN_INTERPRETER_NAMES):
Add ilp32 ld.so names.

Signed-off-by: Yury Norov 
---
 sysdeps/unix/sysv/linux/aarch64/ldconfig.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sysdeps/unix/sysv/linux/aarch64/ldconfig.h 
b/sysdeps/unix/sysv/linux/aarch64/ldconfig.h
index ee91ef8..ac84194 100644
--- a/sysdeps/unix/sysv/linux/aarch64/ldconfig.h
+++ b/sysdeps/unix/sysv/linux/aarch64/ldconfig.h
@@ -21,6 +21,8 @@
 #define SYSDEP_KNOWN_INTERPRETER_NAMES \
   { "/lib/ld-linux-aarch64.so.1", FLAG_ELF_LIBC6 }, \
   { "/lib/ld-linux-aarch64_be.so.1", FLAG_ELF_LIBC6 }, \
+  { "/lib/ld-linux-aarch64_ilp32.so.1", FLAG_ELF_LIBC6 }, \
+  { "/lib/ld-linux-aarch64_be_ilp32.so.1", FLAG_ELF_LIBC6 }, \
   { "/lib/ld-linux.so.3", FLAG_ELF_LIBC6 }, \
   { "/lib/ld-linux-armhf.so.3", FLAG_ELF_LIBC6 },
 #define SYSDEP_KNOWN_LIBRARY_NAMES \
-- 
2.7.4



[PATCH 14/27] [AARCH64] Add ILP32 to makefiles

2016-06-20 Thread Yury Norov
From: Andrew Pinski 

This patch adds ilp32 and ilp32_be as abi variants to the aarch64 linux
makefile.

* sysdeps/unix/sysv/linux/aarch64/Makefile (abi-variants): Add ilp32
and ilp32_be.
(abi-lp64-options): Add defining of LP64 and undef of ILP32 macros.
(abi-lp64-condition): Check word size macro also.
(abi-lp64_be-options): Add defining of LP64 and undef of ILP32 macros.
(abi-lp64_be-condition): Check word size macro also.
(abi-ilp32-options): Define.
(abi-ilp32-condition): Likewise.
(abi-ilp32_be-options): Define.
(abi-ilp32_be-condition): Likewise.

Signed-off-by: Yury Norov 
---
 sysdeps/unix/sysv/linux/aarch64/Makefile | 16 
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/sysdeps/unix/sysv/linux/aarch64/Makefile 
b/sysdeps/unix/sysv/linux/aarch64/Makefile
index 6b4e620..67619f5 100644
--- a/sysdeps/unix/sysv/linux/aarch64/Makefile
+++ b/sysdeps/unix/sysv/linux/aarch64/Makefile
@@ -23,13 +23,21 @@ endif
 
 abi-variants := lp64
 abi-variants += lp64_be
+abi-variants += ilp32
+abi-variants += ilp32_be
 
 ifeq (,$(filter $(default-abi),$(abi-variants)))
 Unknown ABI, must be one of $(abi-variants)
 endif
 
-abi-lp64-options := -U__AARCH64EB__
-abi-lp64-condition := !defined __AARCH64EB__
+abi-lp64-options := -U__AARCH64EB__ -D__LP64__ -U__ILP32__
+abi-lp64-condition := __WORDSIZE == 64 && !defined __AARCH64EB__
 
-abi-lp64_be-options := -D__AARCH64EB__
-abi-lp64_be-condition := defined __AARCH64EB__
+abi-lp64_be-options := -D__AARCH64EB__ -D__LP64__ -U__ILP32__
+abi-lp64_be-condition := __WORDSIZE == 64 && defined __AARCH64EB__
+
+abi-ilp32-options := -U__AARCH64EB__ -U__LP64__ -D__ILP32__
+abi-ilp32-condition := __WORDSIZE == 32 && !defined __AARCH64EB__
+
+abi-ilp32_be-options := -D__AARCH64EB__ -U__LP64__ -D__ILP32__
+abi-ilp32_be-condition := __WORDSIZE == 32 && defined __AARCH64EB__
-- 
2.7.4



[PATCH 24/27] Add support for AT_ARM64_MIDR.

2016-06-20 Thread Yury Norov
From: Andrew Pinski 

Signed-off-by: Yury Norov 
---
 elf/dl-sysdep.c |  1 +
 elf/elf.h   |  3 +++
 sysdeps/unix/sysv/linux/aarch64/dl-auxv.h   | 25 +
 sysdeps/unix/sysv/linux/aarch64/dl-sysdep.c |  5 +
 4 files changed, 34 insertions(+)
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/dl-auxv.h
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/dl-sysdep.c

diff --git a/elf/dl-sysdep.c b/elf/dl-sysdep.c
index eaa7155..88b7dd7 100644
--- a/elf/dl-sysdep.c
+++ b/elf/dl-sysdep.c
@@ -307,6 +307,7 @@ _dl_show_auxv (void)
  [AT_SYSINFO_EHDR - 2] =   { "SYSINFO_EHDR: 0x", hex },
  [AT_RANDOM - 2] = { "RANDOM:   0x", hex },
  [AT_HWCAP2 - 2] = { "HWCAP2:   0x", hex },
+ [AT_ARM64_MIDR - 2] = { "MIDR: 0x", hex },
};
   unsigned int idx = (unsigned int) (av->a_type - 2);
 
diff --git a/elf/elf.h b/elf/elf.h
index 15f5a75..c05bc23 100644
--- a/elf/elf.h
+++ b/elf/elf.h
@@ -1058,6 +1058,9 @@ typedef struct
 #define AT_L2_CACHESHAPE   36
 #define AT_L3_CACHESHAPE   37
 
+/* AARCH64 MIDR system register. */
+#define AT_ARM64_MIDR  38
+
 /* Note section contents.  Each entry in the note section begins with
a header of a fixed form.  */
 
diff --git a/sysdeps/unix/sysv/linux/aarch64/dl-auxv.h 
b/sysdeps/unix/sysv/linux/aarch64/dl-auxv.h
new file mode 100644
index 000..290753d
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/dl-auxv.h
@@ -0,0 +1,25 @@
+/* Auxiliary vector processing for Linux/AARCH64.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+extern int __libc_arm64_midr;
+
+#define DL_PLATFORM_AUXV   \
+  case AT_ARM64_MIDR:  \
+   __libc_arm64_midr = av->a_un.a_val; \
+   break;
diff --git a/sysdeps/unix/sysv/linux/aarch64/dl-sysdep.c 
b/sysdeps/unix/sysv/linux/aarch64/dl-sysdep.c
new file mode 100644
index 000..ca3ec28
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/dl-sysdep.c
@@ -0,0 +1,5 @@
+#include "dl-auxv.h"
+
+int __libc_arm64_midr = -1;
+
+#include 
-- 
2.7.4



[PATCH 22/27] [AARCH64] ILP32: support stat syscall family

2016-06-20 Thread Yury Norov
From: Yury Norov 

The stat and statfs structures have layouts identical to LP64
after changing the off_t, ino_t etc. sizes to 64-bit.  This means we can
pass them to the kernel the same way LP64 does.

Signed-off-by: Yury Norov 
---
 sysdeps/unix/sysv/linux/aarch64/bits/stat.h| 195 +
 sysdeps/unix/sysv/linux/aarch64/bits/statfs.h  |  72 
 sysdeps/unix/sysv/linux/aarch64/ilp32/fstatfs.c|  29 +++
 sysdeps/unix/sysv/linux/aarch64/ilp32/fstatfs64.c  |  72 
 sysdeps/unix/sysv/linux/aarch64/ilp32/fxstat.c |  51 ++
 sysdeps/unix/sysv/linux/aarch64/ilp32/fxstat64.c   |  54 ++
 sysdeps/unix/sysv/linux/aarch64/ilp32/fxstatat.c   |  48 +
 sysdeps/unix/sysv/linux/aarch64/ilp32/fxstatat64.c |  52 ++
 sysdeps/unix/sysv/linux/aarch64/ilp32/lxstat.c |  47 +
 sysdeps/unix/sysv/linux/aarch64/ilp32/statfs.c |  30 
 sysdeps/unix/sysv/linux/aarch64/ilp32/statfs64.c   |  29 +++
 sysdeps/unix/sysv/linux/aarch64/ilp32/xstat.c  |  47 +
 sysdeps/unix/sysv/linux/aarch64/ilp32/xstat64.c|  47 +
 13 files changed, 773 insertions(+)
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/bits/stat.h
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/bits/statfs.h
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/fstatfs.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/fstatfs64.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/fxstat.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/fxstat64.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/fxstatat.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/fxstatat64.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/lxstat.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/statfs.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/statfs64.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/xstat.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/xstat64.c

diff --git a/sysdeps/unix/sysv/linux/aarch64/bits/stat.h 
b/sysdeps/unix/sysv/linux/aarch64/bits/stat.h
new file mode 100644
index 000..eec6789
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/bits/stat.h
@@ -0,0 +1,195 @@
+/* Copyright (C) 1992-2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if !defined _SYS_STAT_H && !defined _FCNTL_H
+# error "Never include <bits/stat.h> directly; use <sys/stat.h> instead."
+#endif
+
+#ifndef _BITS_STAT_H
+#define _BITS_STAT_H   1
+
+/* Versions of the `struct stat' data structure.  */
+#define _STAT_VER_KERNEL   0
+#define _STAT_VER_LINUX0
+#define _STAT_VER  _STAT_VER_KERNEL
+
+/* Versions of the `xmknod' interface.  */
+#define _MKNOD_VER_LINUX   0
+
+#ifdef __ILP32__
+#include 
+struct __kernel_timespec
+  {
+unsigned long long tv_sec; /* Seconds.  */
+long long tv_nsec; /* Nanoseconds.  */
+  };
+#define conv_timespec(u, k) do {   \
+   (u)->tv_sec = (k)->tv_sec;  \
+   (u)->tv_nsec = (k)->tv_nsec;\
+} while (0)
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define __type3264(type, name) \
+   type (name); type name##_pad
+#else
+#define __type3264(type, name) \
+   type name##_pad; type name
+#endif
+
+#else
+#define __kernel_timespec timespec
+#define conv_timespec(u, k)
+#define __type3264(type, name) type name
+#endif /* __ILP32__ */
+
+struct stat
+  {
+__dev_t st_dev;/* Device.  */
+__ino_t st_ino;/* File serial number.  */
+__mode_t st_mode;  /* File mode.  */
+__nlink_t st_nlink;/* Link count.  */
+__uid_t st_uid;/* User ID of the file's owner. */
+__gid_t st_gid;/* Group ID of the file's group.*/
+__dev_t st_rdev;   /* Device number, if device.  */
+__dev_t __pad1;
+__off_t st_size;   /* Size of file, in bytes.  */
+__blksize_t st_blksize;/* Optimal block size for I/O.  */
+int __pad2;
+
+__blkcnt_t st_blocks;  /* Number 512-byte blocks allocated. */
+#ifdef __USE_XOPEN2K8
+/* Nanosecond resolution timestamps are stored in a form

[PATCH 15/27] [AARCH64] Add support to ldconfig for ILP32 and libilp32

2016-06-20 Thread Yury Norov
From: Andrew Pinski 

This patch adds support to ldconfig for libilp32 which is used by
ILP32.

* sysdeps/generic/ldconfig.h (FLAG_AARCH64_LIB32): New define.
* elf/cache.c (print_entry): Handle FLAG_AARCH64_LIB32.
* sysdeps/unix/sysv/linux/aarch64/dl-cache.h (_DL_CACHE_DEFAULT_ID):
Define it to be the FLAG_AARCH64_LIB32 if compiling for ILP32.
(add_system_dir): Add libilp32 to the list of system directories.
* sysdeps/unix/sysv/linux/arm/readelflib.c (process_elf_file):
Handle ILP32 elf binaries.

Signed-off-by: Yury Norov 
---
 elf/cache.c|  2 ++
 sysdeps/generic/ldconfig.h |  1 +
 sysdeps/unix/sysv/linux/aarch64/dl-cache.h | 13 -
 sysdeps/unix/sysv/linux/arm/readelflib.c   |  4 +++-
 4 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/elf/cache.c b/elf/cache.c
index fbee172..cbc6162 100644
--- a/elf/cache.c
+++ b/elf/cache.c
@@ -101,6 +101,8 @@ print_entry (const char *lib, int flag, unsigned int 
osversion,
 case FLAG_AARCH64_LIB64:
   fputs (",AArch64", stdout);
   break;
+case FLAG_AARCH64_LIB32:
+  fputs (",ILP32", stdout);
 /* Uses the ARM soft-float ABI.  */
 case FLAG_ARM_LIBSF:
   fputs (",soft-float", stdout);
diff --git a/sysdeps/generic/ldconfig.h b/sysdeps/generic/ldconfig.h
index 07600b7..8150227 100644
--- a/sysdeps/generic/ldconfig.h
+++ b/sysdeps/generic/ldconfig.h
@@ -42,6 +42,7 @@
 #define FLAG_MIPS_LIB32_NAN20080x0c00
 #define FLAG_MIPS64_LIBN32_NAN2008 0x0d00
 #define FLAG_MIPS64_LIBN64_NAN2008 0x0e00
+#define FLAG_AARCH64_LIB32 0x0f00
 
 /* Name of auxiliary cache.  */
 #define _PATH_LDCONFIG_AUX_CACHE "/var/cache/ldconfig/aux-cache"
diff --git a/sysdeps/unix/sysv/linux/aarch64/dl-cache.h 
b/sysdeps/unix/sysv/linux/aarch64/dl-cache.h
index 9c7b271..044fdba 100644
--- a/sysdeps/unix/sysv/linux/aarch64/dl-cache.h
+++ b/sysdeps/unix/sysv/linux/aarch64/dl-cache.h
@@ -18,7 +18,11 @@
 
 #include 
 
+#ifdef __LP64__
 #define _DL_CACHE_DEFAULT_ID(FLAG_AARCH64_LIB64 | FLAG_ELF_LIBC6)
+#else
+#define _DL_CACHE_DEFAULT_ID(FLAG_AARCH64_LIB32 | FLAG_ELF_LIBC6)
+#endif
 
 #define _dl_cache_check_flags(flags)\
   ((flags) == _DL_CACHE_DEFAULT_ID)
@@ -27,18 +31,25 @@
   do   \
 {  \
   size_t len = strlen (dir);   \
-  char path[len + 3];  \
+  char path[len + 6];  \
   memcpy (path, dir, len + 1); \
   if (len >= 6 && ! memcmp (path + len - 6, "/lib64", 6))  \
{   \
  len -= 2; \
  path[len] = '\0'; \
}   \
+  if (len >= 9 && ! memcmp (path + len - 9, "/libilp32", 9))\
+   {   \
+ len -= 5; \
+ path[len] = '\0'; \
+   }   \
   add_dir (path);  \
   if (len >= 4 && ! memcmp (path + len - 4, "/lib", 4))\
{   \
  memcpy (path + len, "64", 3); \
  add_dir (path);   \
+ memcpy (path + len, "ilp32", 6);  \
+ add_dir (path);   \
}   \
 } while (0)
 
diff --git a/sysdeps/unix/sysv/linux/arm/readelflib.c 
b/sysdeps/unix/sysv/linux/arm/readelflib.c
index e6ae72e..ddbad25 100644
--- a/sysdeps/unix/sysv/linux/arm/readelflib.c
+++ b/sysdeps/unix/sysv/linux/arm/readelflib.c
@@ -41,7 +41,9 @@ process_elf_file (const char *file_name, const char *lib, int 
*flag,
   ret = process_elf32_file (file_name, lib, flag, osversion, soname,
file_contents, file_length);
 
-  if (!ret && EF_ARM_EABI_VERSION (elf32_header->e_flags) == 
EF_ARM_EABI_VER5)
+  if (!ret && elf_header->e_machine == EM_AARCH64)
+   *flag = FLAG_AARCH64_LIB32|FLAG_ELF_LIBC6;
+  else if (!ret && EF_ARM_EABI_VERSION (elf32_header->e_flags) == 
EF_ARM_EABI_VER5)
{
  if (elf32_header->e_flags & EF_ARM_ABI_FLOAT_HARD)
*flag = FLAG_ARM_LIBHF|FLAG_ELF_LIBC6;
-- 
2.7.4



Re: [PATCH] no wrappers

2016-06-20 Thread Yury Norov
This patch is out of series. I sent it erroneously. Please ignore it.
On Tue, Jun 21, 2016 at 08:06:45AM +0300, Yury Norov wrote:
> Signed-off-by: Yury Norov 
> ---
>  sysdeps/unix/sysv/linux/aarch64/ilp32/fallocate.c  | 31 
> ++
>  .../unix/sysv/linux/aarch64/ilp32/fallocate64.c|  1 +
>  sysdeps/unix/sysv/linux/aarch64/ilp32/ftruncate.c  |  1 +
>  .../unix/sysv/linux/aarch64/ilp32/ftruncate64.c|  4 +++
>  sysdeps/unix/sysv/linux/aarch64/ilp32/llseek.c | 22 +--
>  sysdeps/unix/sysv/linux/aarch64/ilp32/lseek.c  |  7 +++--
>  sysdeps/unix/sysv/linux/aarch64/ilp32/mmap.c   |  2 +-
>  .../unix/sysv/linux/aarch64/ilp32/posix_fadvise.c  |  1 +
>  .../sysv/linux/aarch64/ilp32/posix_fadvise64.c |  2 ++
>  sysdeps/unix/sysv/linux/aarch64/ilp32/pread.c  |  6 +
>  sysdeps/unix/sysv/linux/aarch64/ilp32/pread64.c|  6 -
>  sysdeps/unix/sysv/linux/aarch64/ilp32/pwrite.c |  6 +
>  sysdeps/unix/sysv/linux/aarch64/ilp32/pwrite64.c   |  6 -
>  sysdeps/unix/sysv/linux/aarch64/ilp32/readahead.c  |  1 +
>  sysdeps/unix/sysv/linux/aarch64/ilp32/truncate.c   |  1 +
>  sysdeps/unix/sysv/linux/aarch64/ilp32/truncate64.c |  4 +++
>  16 files changed, 65 insertions(+), 36 deletions(-)
>  create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/fallocate.c
>  create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/fallocate64.c
>  create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/ftruncate.c
>  create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/ftruncate64.c
>  create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/posix_fadvise.c
>  create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/posix_fadvise64.c
>  create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/readahead.c
>  create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/truncate.c
>  create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/truncate64.c
> 
> diff --git a/sysdeps/unix/sysv/linux/aarch64/ilp32/fallocate.c 
> b/sysdeps/unix/sysv/linux/aarch64/ilp32/fallocate.c
> new file mode 100644
> index 000..4951d06
> --- /dev/null
> +++ b/sysdeps/unix/sysv/linux/aarch64/ilp32/fallocate.c
> @@ -0,0 +1,31 @@
> +/* Copyright (C) 2007-2016 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include 
> +#include 
> +#include 
> +
> +
> +/* Reserve storage for the data of the file associated with FD.  */
> +int
> +fallocate (int fd, int mode, __off_t offset, __off_t len)
> +{
> +  return SYSCALL_CANCEL (fallocate, fd, mode,
> +  __LONG_LONG_PAIR (offset >> 32, offset),
> +  __LONG_LONG_PAIR (len >> 32, len));
> +}
> +weak_alias (fallocate, fallocate64)
> diff --git a/sysdeps/unix/sysv/linux/aarch64/ilp32/fallocate64.c 
> b/sysdeps/unix/sysv/linux/aarch64/ilp32/fallocate64.c
> new file mode 100644
> index 000..f27735a
> --- /dev/null
> +++ b/sysdeps/unix/sysv/linux/aarch64/ilp32/fallocate64.c
> @@ -0,0 +1 @@
> +/* See sysdeps/unix/sysv/linux/aarch64/ilp32/fallocate.c */
> diff --git a/sysdeps/unix/sysv/linux/aarch64/ilp32/ftruncate.c 
> b/sysdeps/unix/sysv/linux/aarch64/ilp32/ftruncate.c
> new file mode 100644
> index 000..fb5b598
> --- /dev/null
> +++ b/sysdeps/unix/sysv/linux/aarch64/ilp32/ftruncate.c
> @@ -0,0 +1 @@
> +/* See sysdeps/unix/sysv/linux/aarch64/ilp32/ftruncate64.c */
> diff --git a/sysdeps/unix/sysv/linux/aarch64/ilp32/ftruncate64.c 
> b/sysdeps/unix/sysv/linux/aarch64/ilp32/ftruncate64.c
> new file mode 100644
> index 000..bd0f5fe
> --- /dev/null
> +++ b/sysdeps/unix/sysv/linux/aarch64/ilp32/ftruncate64.c
> @@ -0,0 +1,4 @@
> +#include 
> +
> +weak_alias (__ftruncate64, __ftruncate)
> +weak_alias (__ftruncate64, ftruncate)
> diff --git a/sysdeps/unix/sysv/linux/aarch64/ilp32/llseek.c 
> b/sysdeps/unix/sysv/linux/aarch64/ilp32/llseek.c
> index f1a3207..b00ca27 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/ilp32/llseek.c
> +++ b/sysdeps/unix/sys

[PATCH 01/27] [AARCH64] Fix utmp struct for compatibility reasons.

2016-06-20 Thread Yury Norov
From: Andrew Pinski 

NOTE This is an ABI change for AARCH64.
If you have some AARCH32 and AARCH64 applications and they both use
utmp, one of them will fail due to the use of time_t inside the
utmp binary format.

This fixes the problem by setting __WORDSIZE_TIME64_COMPAT32.

* sysdeps/aarch64/bits/wordsize.h: New file.

Signed-off-by: Yury Norov 
---
 sysdeps/aarch64/bits/wordsize.h | 26 ++
 1 file changed, 26 insertions(+)
 create mode 100644 sysdeps/aarch64/bits/wordsize.h

diff --git a/sysdeps/aarch64/bits/wordsize.h b/sysdeps/aarch64/bits/wordsize.h
new file mode 100644
index 000..3ecccaa
--- /dev/null
+++ b/sysdeps/aarch64/bits/wordsize.h
@@ -0,0 +1,26 @@
+/* Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define __WORDSIZE 64
+
+/* LP64 ABI has a 64bit time_t.
+   This allows aarch32 and AARCH64 applications
+   both access utmp. */
+#define __WORDSIZE_TIME64_COMPAT32 1
+
+/* LP64 use the 64bit system call interface. */
+#define __SYSCALL_WORDSIZE 64
-- 
2.7.4



[PATCH 03/27] Add dynamic ILP32 AARCH64 relocations to elf.h

2016-06-20 Thread Yury Norov
From: Andrew Pinski 

elf/elf.h (R_AARCH64_P32_ABS32, R_AARCH64_P32_COPY,
R_AARCH64_P32_GLOB_DAT, R_AARCH64_P32_JUMP_SLOT,
R_AARCH64_P32_RELATIVE, R_AARCH64_P32_TLS_DTPMOD,
R_AARCH64_P32_TLS_DTPREL, R_AARCH64_P32_TLS_TPREL,
R_AARCH64_P32_TLSDESC, R_AARCH64_P32_IRELATIVE): Define.

[AARCH64] Use ELFW and ElfW macros instead of ELF64 and Elf64 names.

* sysdeps/aarch64/dl-machine.h
(elf_machine_runtime_setup): Use ElfW(Addr).
(elf_machine_rela): Use ELFW(R_TYPE).
(elf_machine_lazy_rel): Likewise.

[AARCH64] Introduce AARCH64_R so we can reuse the reloc code between ILP32 and 
LP64.

* sysdeps/aarch64/sysdep.h (AARCH64_R): Define.
* sysdeps/aarch64/dl-irel.h: Include sysdep.h
(elf_irela): Use reloc names based on AARCH64_R.
* sysdeps/aarch64/dl-machine.h: Include sysdep.h
(elf_machine_type_class): Use reloc names based on AARCH64_R.
(elf_machine_rela): Likewise.
(elf_machine_lazy_rel): Likewise.

Signed-off-by: Yury Norov 
---
 sysdeps/aarch64/dl-irel.h|  3 ++-
 sysdeps/aarch64/dl-machine.h | 53 +++-
 sysdeps/aarch64/sysdep.h |  6 +
 3 files changed, 36 insertions(+), 26 deletions(-)

diff --git a/sysdeps/aarch64/dl-irel.h b/sysdeps/aarch64/dl-irel.h
index 63a8e50..460454f 100644
--- a/sysdeps/aarch64/dl-irel.h
+++ b/sysdeps/aarch64/dl-irel.h
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define ELF_MACHINE_IRELA  1
 
@@ -40,7 +41,7 @@ elf_irela (const ElfW(Rela) *reloc)
   ElfW(Addr) *const reloc_addr = (void *) reloc->r_offset;
   const unsigned long int r_type = ELFW(R_TYPE) (reloc->r_info);
 
-  if (__glibc_likely (r_type == R_AARCH64_IRELATIVE))
+  if (__glibc_likely (r_type == AARCH64_R(IRELATIVE)))
 {
   ElfW(Addr) value = elf_ifunc_invoke (reloc->r_addend);
   *reloc_addr = value;
diff --git a/sysdeps/aarch64/dl-machine.h b/sysdeps/aarch64/dl-machine.h
index 282805e..a16cb11 100644
--- a/sysdeps/aarch64/dl-machine.h
+++ b/sysdeps/aarch64/dl-machine.h
@@ -21,6 +21,7 @@
 
 #define ELF_MACHINE_NAME "aarch64"
 
+#include 
 #include 
 #include 
 #include 
@@ -190,15 +191,15 @@ _dl_start_user:   
\n\
 ");
 
 #define elf_machine_type_class(type)   \
-  type) == R_AARCH64_JUMP_SLOT ||  \
- (type) == R_AARCH64_TLS_DTPMOD || \
- (type) == R_AARCH64_TLS_DTPREL || \
- (type) == R_AARCH64_TLS_TPREL ||  \
- (type) == R_AARCH64_TLSDESC) * ELF_RTYPE_CLASS_PLT)   \
-   | (((type) == R_AARCH64_COPY) * ELF_RTYPE_CLASS_COPY)   \
-   | (((type) == R_AARCH64_GLOB_DAT) * ELF_RTYPE_CLASS_EXTERN_PROTECTED_DATA))
+  type) == AARCH64_R(JUMP_SLOT) || \
+ (type) == AARCH64_R(TLS_DTPMOD) ||
\
+ (type) == AARCH64_R(TLS_DTPREL) ||
\
+ (type) == AARCH64_R(TLS_TPREL) || \
+ (type) == AARCH64_R(TLSDESC)) * ELF_RTYPE_CLASS_PLT)  \
+   | (((type) == AARCH64_R(COPY)) * ELF_RTYPE_CLASS_COPY)  \
+   | (((type) == AARCH64_R(GLOB_DAT)) * ELF_RTYPE_CLASS_EXTERN_PROTECTED_DATA))
 
-#define ELF_MACHINE_JMP_SLOT   R_AARCH64_JUMP_SLOT
+#define ELF_MACHINE_JMP_SLOT   AARCH64_R(JUMP_SLOT)
 
 /* AArch64 uses RELA not REL */
 #define ELF_MACHINE_NO_REL 1
@@ -237,9 +238,9 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) 
*reloc,
  void *const reloc_addr_arg, int skip_ifunc)
 {
   ElfW(Addr) *const reloc_addr = reloc_addr_arg;
-  const unsigned int r_type = ELF64_R_TYPE (reloc->r_info);
+  const unsigned int r_type = ELFW(R_TYPE) (reloc->r_info);
 
-  if (__builtin_expect (r_type == R_AARCH64_RELATIVE, 0))
+  if (__builtin_expect (r_type == AARCH64_R(RELATIVE), 0))
   *reloc_addr = map->l_addr + reloc->r_addend;
   else if (__builtin_expect (r_type == R_AARCH64_NONE, 0))
   return;
@@ -257,7 +258,7 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) 
*reloc,
 
   switch (r_type)
{
-   case R_AARCH64_COPY:
+   case AARCH64_R(COPY):
  if (sym == NULL)
  break;
 
@@ -275,15 +276,17 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) 
*reloc,
  MIN (sym->st_size, refsym->st_size));
  break;
 
-   case R_AARCH64_RELATIVE:
-   case R_AARCH64_GLOB_DAT:
-   case R_AARCH64_JUMP_SLOT:
-   case R_AARCH64_ABS32:
-   case R_AARCH64_ABS64:
+   case AARCH64_R(RELATIVE):
+   case AARCH64_R(GLOB_DAT):
+   case AARCH64_R(JUMP_SLOT):
+   case AARCH64_R(ABS32):
+#ifdef __LP64__
+   case AARCH64_R(ABS64):
+#endif
  *reloc_addr = value + reloc->r_addend;
  break;
 
-   case R_AARCH64_TLSDESC:
+   case AARCH64_R(TLSDESC):
  

[PATCH 02/27] [AARCH64] Add header guards to sysdep.h headers.

2016-06-20 Thread Yury Norov
From: Andrew Pinski 

* sysdeps/aarch64/sysdep.h: Add header guards.

[AARCH64] Remove 64 from some relocation names as they have been renamed in 
later versions of the spec.

The AARCH64 elf ABI spec renamed some relocations removing 64 from the TLS
relocation names to make them consistent with the ILP32 named ones.

* elf/elf.h (R_AARCH64_TLS_DTPMOD64): Rename to ..
(R_AARCH64_TLS_DTPMOD): This.
(R_AARCH64_TLS_DTPREL64): Rename to ...
(R_AARCH64_TLS_DTPREL): This.
(R_AARCH64_TLS_TPREL64): Rename to ...
(R_AARCH64_TLS_TPREL): This.
* sysdeps/aarch64/dl-machine.h (elf_machine_type_class): Update
R_AARCH64_TLS_DTPMOD64, R_AARCH64_TLS_DTPREL64, and R_AARCH64_TLS_TPREL64.
(elf_machine_rela): Likewise.

[AARCH64] Fix pltenter and pltexit for ILP32.

* sysdeps/aarch64/bits/link.h (la_aarch64_gnu_pltenter): Use
ElfW macro instead of hardcoded Elf64 types.

Signed-off-by: Yury Norov 
---
 sysdeps/aarch64/sysdep.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/sysdeps/aarch64/sysdep.h b/sysdeps/aarch64/sysdep.h
index 6b728ec..594ab0b 100644
--- a/sysdeps/aarch64/sysdep.h
+++ b/sysdeps/aarch64/sysdep.h
@@ -16,6 +16,9 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>.  */
 
+#ifndef _AARCH64_SYSDEP_H
+#define _AARCH64_SYSDEP_H
+
 #include 
 
 #ifdef __ASSEMBLER__
@@ -96,3 +99,5 @@
 #define mcount _mcount
 
 #endif /* __ASSEMBLER__ */
+
+#endif  /* _AARCH64_SYSDEP_H */
-- 
2.7.4



[PATCH] no wrappers

2016-06-20 Thread Yury Norov
Signed-off-by: Yury Norov 
---
 sysdeps/unix/sysv/linux/aarch64/ilp32/fallocate.c  | 31 ++
 .../unix/sysv/linux/aarch64/ilp32/fallocate64.c|  1 +
 sysdeps/unix/sysv/linux/aarch64/ilp32/ftruncate.c  |  1 +
 .../unix/sysv/linux/aarch64/ilp32/ftruncate64.c|  4 +++
 sysdeps/unix/sysv/linux/aarch64/ilp32/llseek.c | 22 +--
 sysdeps/unix/sysv/linux/aarch64/ilp32/lseek.c  |  7 +++--
 sysdeps/unix/sysv/linux/aarch64/ilp32/mmap.c   |  2 +-
 .../unix/sysv/linux/aarch64/ilp32/posix_fadvise.c  |  1 +
 .../sysv/linux/aarch64/ilp32/posix_fadvise64.c |  2 ++
 sysdeps/unix/sysv/linux/aarch64/ilp32/pread.c  |  6 +
 sysdeps/unix/sysv/linux/aarch64/ilp32/pread64.c|  6 -
 sysdeps/unix/sysv/linux/aarch64/ilp32/pwrite.c |  6 +
 sysdeps/unix/sysv/linux/aarch64/ilp32/pwrite64.c   |  6 -
 sysdeps/unix/sysv/linux/aarch64/ilp32/readahead.c  |  1 +
 sysdeps/unix/sysv/linux/aarch64/ilp32/truncate.c   |  1 +
 sysdeps/unix/sysv/linux/aarch64/ilp32/truncate64.c |  4 +++
 16 files changed, 65 insertions(+), 36 deletions(-)
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/fallocate.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/fallocate64.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/ftruncate.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/ftruncate64.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/posix_fadvise.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/posix_fadvise64.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/readahead.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/truncate.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/ilp32/truncate64.c

diff --git a/sysdeps/unix/sysv/linux/aarch64/ilp32/fallocate.c 
b/sysdeps/unix/sysv/linux/aarch64/ilp32/fallocate.c
new file mode 100644
index 000..4951d06
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/ilp32/fallocate.c
@@ -0,0 +1,31 @@
+/* Copyright (C) 2007-2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include 
+#include 
+#include 
+
+
+/* Reserve storage for the data of the file associated with FD.  */
+int
+fallocate (int fd, int mode, __off_t offset, __off_t len)
+{
+  return SYSCALL_CANCEL (fallocate, fd, mode,
+__LONG_LONG_PAIR (offset >> 32, offset),
+__LONG_LONG_PAIR (len >> 32, len));
+}
+weak_alias (fallocate, fallocate64)
diff --git a/sysdeps/unix/sysv/linux/aarch64/ilp32/fallocate64.c 
b/sysdeps/unix/sysv/linux/aarch64/ilp32/fallocate64.c
new file mode 100644
index 000..f27735a
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/ilp32/fallocate64.c
@@ -0,0 +1 @@
+/* See sysdeps/unix/sysv/linux/aarch64/ilp32/fallocate.c */
diff --git a/sysdeps/unix/sysv/linux/aarch64/ilp32/ftruncate.c 
b/sysdeps/unix/sysv/linux/aarch64/ilp32/ftruncate.c
new file mode 100644
index 000..fb5b598
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/ilp32/ftruncate.c
@@ -0,0 +1 @@
+/* See sysdeps/unix/sysv/linux/aarch64/ilp32/ftruncate64.c */
diff --git a/sysdeps/unix/sysv/linux/aarch64/ilp32/ftruncate64.c 
b/sysdeps/unix/sysv/linux/aarch64/ilp32/ftruncate64.c
new file mode 100644
index 000..bd0f5fe
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/ilp32/ftruncate64.c
@@ -0,0 +1,4 @@
+#include 
+
+weak_alias (__ftruncate64, __ftruncate)
+weak_alias (__ftruncate64, ftruncate)
diff --git a/sysdeps/unix/sysv/linux/aarch64/ilp32/llseek.c 
b/sysdeps/unix/sysv/linux/aarch64/ilp32/llseek.c
index f1a3207..b00ca27 100644
--- a/sysdeps/unix/sysv/linux/aarch64/ilp32/llseek.c
+++ b/sysdeps/unix/sysv/linux/aarch64/ilp32/llseek.c
@@ -1,21 +1 @@
-#include 
-#include 
-#include 
-
-#include 
-#include 
-
-#include "overflow.h"
-
-extern loff_t __llseek (int fd, loff_t offset, int whence);
-
-loff_t
-__llseek (int fd, loff_t offset, int whence)
-{
-  return INLINE_SYSCALL (_llseek, 3, fd, offset, whence);
-}
-weak_alias (__llseek, llseek)
-strong_alias (__llseek, __libc_lseek64)
-strong_alias (__llseek, __lseek64)
-weak_alias (__llseek, lseek64)
-
+#include 
diff --git a/sysdeps/unix/sysv/linux/aarch64/ilp32/lseek.c 
b/sysdeps/unix/sysv/linux/aarch64/ilp32/lseek.c
index 85d8416..9f7792f 100644
--- a/sys

[PATCH 04/27] [AARCH64] Add PTR_REG, PTR_LOG_SIZE, and PTR_SIZE. Use it in LDST_PCREL and LDST_GLOBAL.

2016-06-20 Thread Yury Norov
From: Andrew Pinski 

This patch adds a few extra macros which are useful for ILP32 and updates
PTR_MANGLE and PTR_DEMANGLE to use the some of the new macros.

* sysdeps/aarch64/sysdep.h (PTR_REG): New macro.
(PTR_LOG_SIZE): New macro.
(PTR_SIZE): New macro.
(LDST_PCREL): Use arguments as register numbers rather than register names.
Use PTR_REG for the register when loading.
* sysdeps/unix/sysv/linux/aarch64/sysdep.h (PTR_MANGLE): Add a comment
noting that the arguments are register numbers rather than register names.
(PTR_DEMANGLE): Likewise.
* sysdeps/aarch64/__longjmp.S (__longjmp): Update calls to PTR_DEMANGLE.
* sysdeps/aarch64/setjmp.S (__sigsetjmp): Update calls to PTR_MANGLE.

Signed-off-by: Yury Norov 
---
 sysdeps/aarch64/__longjmp.S  |  4 ++--
 sysdeps/aarch64/setjmp.S |  4 ++--
 sysdeps/aarch64/sysdep.h | 28 ++--
 sysdeps/unix/sysv/linux/aarch64/sysdep.h |  8 ++--
 4 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/sysdeps/aarch64/__longjmp.S b/sysdeps/aarch64/__longjmp.S
index 65116be..58332be 100644
--- a/sysdeps/aarch64/__longjmp.S
+++ b/sysdeps/aarch64/__longjmp.S
@@ -53,7 +53,7 @@ ENTRY (__longjmp)
ldp x27, x28, [x0, #JB_X27<<3]
 #ifdef PTR_DEMANGLE
ldp x29,  x4, [x0, #JB_X29<<3]
-   PTR_DEMANGLE (x30, x4, x3, x2)
+   PTR_DEMANGLE (30, 4, 3, 2)
 #else
ldp x29, x30, [x0, #JB_X29<<3]
 #endif
@@ -98,7 +98,7 @@ ENTRY (__longjmp)
cfi_same_value(d15)
 #ifdef PTR_DEMANGLE
ldr x4, [x0, #JB_SP<<3]
-   PTR_DEMANGLE (x5, x4, x3, x2)
+   PTR_DEMANGLE (5, 4, 3, 2)
 #else
ldr x5, [x0, #JB_SP<<3]
 #endif
diff --git a/sysdeps/aarch64/setjmp.S b/sysdeps/aarch64/setjmp.S
index 22f4368..da83f19 100644
--- a/sysdeps/aarch64/setjmp.S
+++ b/sysdeps/aarch64/setjmp.S
@@ -42,7 +42,7 @@ ENTRY (__sigsetjmp)
stp x27, x28, [x0, #JB_X27<<3]
 
 #ifdef PTR_MANGLE
-   PTR_MANGLE (x4, x30, x3, x2)
+   PTR_MANGLE (4, 30, 3, 2)
stp x29,  x4, [x0, #JB_X29<<3]
 #else
stp x29, x30, [x0, #JB_X29<<3]
@@ -57,7 +57,7 @@ ENTRY (__sigsetjmp)
stp d14, d15, [x0, #JB_D14<<3]
 #ifdef PTR_MANGLE
mov x4, sp
-   PTR_MANGLE (x5, x4, x3, x2)
+   PTR_MANGLE (5, 4, 3, 2)
str x5, [x0, #JB_SP<<3]
 #else
mov x2,  sp
diff --git a/sysdeps/aarch64/sysdep.h b/sysdeps/aarch64/sysdep.h
index 4cb028f..f2ea821 100644
--- a/sysdeps/aarch64/sysdep.h
+++ b/sysdeps/aarch64/sysdep.h
@@ -23,10 +23,16 @@
 
 #ifdef __LP64__
 #define AARCH64_R(NAME)R_AARCH64_ ## NAME
+#define PTR_REG(n) x##n
+#define PTR_LOG_SIZE   3
 #else
 #define AARCH64_R(NAME)R_AARCH64_P32_ ## NAME
+#define PTR_REG(n) w##n
+#define PTR_LOG_SIZE   2
 #endif
 
+#define PTR_SIZE   (1<<PTR_LOG_SIZE)

[PATCH 07/27] [AARCH64] Use PTR_* macros in dl-trampoline.S

2016-06-20 Thread Yury Norov
From: Andrew Pinski 

Use the PTR_* macros in dl-trampoline.S so it can be used for
both ILP32 and LP64.  Also add a comment about what was an magic number
(the size of the rela relocation entries).

* sysdeps/aarch64/dl-trampoline.S (ip0l): New define.
(RELA_SIZE): New define.
(_dl_runtime_resolve): Use PTR_REG, PTR_SIZE.
(_dl_runtime_profile): Likewise.  Use RELA_SIZE and ip0l.

Signed-off-by: Yury Norov 
---
 sysdeps/aarch64/dl-trampoline.S | 18 +++---
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/sysdeps/aarch64/dl-trampoline.S b/sysdeps/aarch64/dl-trampoline.S
index 947a515..63ef6f7 100644
--- a/sysdeps/aarch64/dl-trampoline.S
+++ b/sysdeps/aarch64/dl-trampoline.S
@@ -22,9 +22,13 @@
 #include "dl-link.h"
 
 #define ip0 x16
+#define ip0l PTR_REG (16)
 #define ip1 x17
 #define lr  x30
 
+/* RELA relocations are 3 pointers */
+#define RELA_SIZE (PTR_SIZE * 3)
+
.text
.globl _dl_runtime_resolve
.type _dl_runtime_resolve, #function
@@ -79,7 +83,7 @@ _dl_runtime_resolve:
cfi_rel_offset (q1, 80+7*16)
 
/* Get pointer to linker struct.  */
-   ldr x0, [ip0, #-8]
+   ldr PTR_REG (0), [ip0, #-PTR_SIZE]
 
/* Prepare to call _dl_fixup().  */
ldr x1, [sp, 80+8*16]   /* Recover &PLTGOT[n] */
@@ -87,7 +91,7 @@ _dl_runtime_resolve:
sub x1, x1, ip0
add x1, x1, x1, lsl #1
lsl x1, x1, #3
-   sub x1, x1, #192
+   sub x1, x1, #(RELA_SIZE<<3)
lsr x1, x1, #3
 
/* Call fixup routine.  */
@@ -191,7 +195,7 @@ _dl_runtime_profile:
stp x0, x1, [x29, #OFFSET_RG + DL_OFFSET_RG_SP]
 
/* Get pointer to linker struct.  */
-   ldr x0, [ip0, #-8]
+   ldr PTR_REG (0), [ip0, #-PTR_SIZE]
 
/* Prepare to call _dl_profile_fixup().  */
ldr x1, [x29, OFFSET_PLTGOTN]   /* Recover &PLTGOT[n] */
@@ -199,7 +203,7 @@ _dl_runtime_profile:
sub x1, x1, ip0
add x1, x1, x1, lsl #1
lsl x1, x1, #3
-   sub x1, x1, #192
+   sub x1, x1, #(RELA_SIZE<<3)
lsr x1, x1, #3
 
stp x0, x1, [x29, #OFFSET_SAVED_CALL_X0]
@@ -210,8 +214,8 @@ _dl_runtime_profile:
add x4, x29, #OFFSET_FS /* address of framesize */
bl  _dl_profile_fixup
 
-   ldr ip0, [x29, #OFFSET_FS]  /* framesize == 0 */
-   cmp ip0, #0
+   ldr ip0l, [x29, #OFFSET_FS] /* framesize == 0 */
+   cmp ip0l, #0
bge 1f
cfi_remember_state
 
@@ -243,7 +247,7 @@ _dl_runtime_profile:
 1:
/* The new frame size is in ip0.  */
 
-   sub x1, x29, ip0
+   sub PTR_REG (1), PTR_REG (29), ip0l
and sp, x1, #0xfff0
 
str x0, [x29, #OFFSET_T1]
-- 
2.7.4



[PATCH 25/27] [AARCH64] Fix ILP32 warning

2016-06-21 Thread Yury Norov
From: Andrew Pinski 

Signed-off-by: Yury Norov 
---
 sysdeps/aarch64/jmpbuf-unwind.h  | 2 +-
 sysdeps/unix/sysv/linux/aarch64/sigcontextinfo.h | 2 +-
 sysdeps/unix/sysv/linux/generic/brk.c| 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/sysdeps/aarch64/jmpbuf-unwind.h b/sysdeps/aarch64/jmpbuf-unwind.h
index 3e0a37d..5603828 100644
--- a/sysdeps/aarch64/jmpbuf-unwind.h
+++ b/sysdeps/aarch64/jmpbuf-unwind.h
@@ -27,7 +27,7 @@
   ((void *) (address) < (void *) demangle (jmpbuf[JB_SP]))
 
 #define _JMPBUF_CFA_UNWINDS_ADJ(jmpbuf, context, adj) \
-  _JMPBUF_UNWINDS_ADJ (jmpbuf, (void *) _Unwind_GetCFA (context), adj)
+  _JMPBUF_UNWINDS_ADJ (jmpbuf, (void *)(size_t) _Unwind_GetCFA (context), adj)
 
 #define _JMPBUF_UNWINDS_ADJ(_jmpbuf, _address, _adj) \
   ((uintptr_t) (_address) - (_adj) < _jmpbuf_sp (_jmpbuf) - (_adj))
diff --git a/sysdeps/unix/sysv/linux/aarch64/sigcontextinfo.h 
b/sysdeps/unix/sysv/linux/aarch64/sigcontextinfo.h
index a579501..ee54222 100644
--- a/sysdeps/unix/sysv/linux/aarch64/sigcontextinfo.h
+++ b/sysdeps/unix/sysv/linux/aarch64/sigcontextinfo.h
@@ -19,7 +19,7 @@
 #include 
 
 #define SIGCONTEXT siginfo_t *_si, struct ucontext *
-#define GET_PC(ctx) ((void *) (ctx)->uc_mcontext.pc)
+#define GET_PC(ctx) ((void *) (size_t) (ctx)->uc_mcontext.pc)
 
 /* There is no reliable way to get the sigcontext unless we use a
three-argument signal handler.  */
diff --git a/sysdeps/unix/sysv/linux/generic/brk.c 
b/sysdeps/unix/sysv/linux/generic/brk.c
index 1b84004..f369e06 100644
--- a/sysdeps/unix/sysv/linux/generic/brk.c
+++ b/sysdeps/unix/sysv/linux/generic/brk.c
@@ -33,7 +33,7 @@ __brk (void *addr)
 {
   INTERNAL_SYSCALL_DECL (err);
 
-  __curbrk = (void *) INTERNAL_SYSCALL (brk, err, 1, addr);
+  __curbrk = (void *) (size_t) INTERNAL_SYSCALL (brk, err, 1, addr);
   if (__curbrk < addr)
 {
   __set_errno (ENOMEM);
-- 
2.7.4



Re: [PATCH 04/27] [AARCH64] Add PTR_REG, PTR_LOG_SIZE, and PTR_SIZE. Use it in LDST_PCREL and LDST_GLOBAL.

2016-06-21 Thread Yury Norov
On Tue, Jun 21, 2016 at 09:54:47AM +0200, Andreas Schwab wrote:
> Yury Norov  writes:
> 
> > diff --git a/sysdeps/aarch64/sysdep.h b/sysdeps/aarch64/sysdep.h
> > index 4cb028f..f2ea821 100644
> > --- a/sysdeps/aarch64/sysdep.h
> > +++ b/sysdeps/aarch64/sysdep.h
> > @@ -23,10 +23,16 @@
> >  
> >  #ifdef __LP64__
> >  #define AARCH64_R(NAME)R_AARCH64_ ## NAME
> > +#define PTR_REG(n) x##n
> > +#define PTR_LOG_SIZE   3
> >  #else
> >  #define AARCH64_R(NAME)R_AARCH64_P32_ ## NAME
> > +#define PTR_REG(n) w##n
> > +#define PTR_LOG_SIZE   2
> >  #endif
> >  
> > +#define PTR_SIZE   (1<<PTR_LOG_SIZE)
> > +
> >  #ifdef __ASSEMBLER__
> >  
> >  /* Syntactic details of assembler.  */
> > @@ -87,16 +93,18 @@
> >  # define L(name) .L##name
> >  #endif
> >  
> > -/* Load or store to/from a pc-relative EXPR into/from R, using T.  */
> > -#define LDST_PCREL(OP, R, T, EXPR)  \
> > -   adrpT, EXPR;\
> > -   OP  R, [T, #:lo12:EXPR];\
> > -
> > -/* Load or store to/from a got-relative EXPR into/from R, using T.  */
> > -#define LDST_GLOBAL(OP, R, T, EXPR) \
> > -   adrpT, :got:EXPR;   \
> > -   ldr T, [T, #:got_lo12:EXPR];\
> > -   OP  R, [T];
> > +/* Load or store to/from a pc-relative EXPR into/from R, using T.
> > +   Note R and T are register numbers and not register names.  */
> > +#define LDST_PCREL(OP, R, T, EXPR) \
> > +   adrpx##T, EXPR; \
> > +   OP  PTR_REG (R), [x##T, #:lo12:EXPR];   \
> > +
> > +/* Load or store to/from a got-relative EXPR into/from R, using T.
> > +   Note R and T are register numbers and not register names.  */
> > +#define LDST_GLOBAL(OP, R, T,  EXPR)   \
> > +   adrpx##T, :got:EXPR;\
> > +   ldr PTR_REG (T), [x##T, #:got_lo12:EXPR];   \
> > +   OP  x##R, [x##T];
> 
> I think this needs to be PTR_REG(x).
> 
> Andreas.

Hi Andreas,

Thanks a lot for review. I will handle your comments and send new
version soon, maybe tomorrow.


Re: [PATCH v3] tools/perf: Fix the mask in regs_dump__printf and print_sample_iregs

2016-06-21 Thread Yury Norov
On Tue, Jun 21, 2016 at 08:26:40PM +0530, Madhavan Srinivasan wrote:
> When decoding the perf_regs mask in regs_dump__printf(),
> we loop through the mask using find_first_bit and find_next_bit functions.
> "mask" is of type "u64", but sent as a "unsigned long *" to
> lib functions along with sizeof().
> 
> While the existing code works fine in most of the cases,
> the logic is broken when using a 32bit perf on a 64bit kernel (Big Endian).
> When reading u64 using (u32 *)(&val)[0], perf (lib/find_*_bit()) assumes it 
> gets
> lower 32bits of u64 which is wrong. Proposed fix is to swap the words
> of the u64 to handle this case. This is _not_ endianess swap.
> 
> Suggested-by: Yury Norov 
> Cc: Yury Norov 
> Cc: Peter Zijlstra 
> Cc: Ingo Molnar 
> Cc: Arnaldo Carvalho de Melo 
> Cc: Alexander Shishkin 
> Cc: Jiri Olsa 
> Cc: Adrian Hunter 
> Cc: Kan Liang 
> Cc: Wang Nan 
> Cc: Michael Ellerman 
> Signed-off-by: Madhavan Srinivasan 
> ---
> Changelog v2:
> 1)Moved the swap code to a common function
> 2)Added more comments in the code
> 
> Changelog v1:
> 1)updated commit message and patch subject
> 2)Add the fix to print_sample_iregs() in builtin-script.c
> 
>  tools/include/linux/bitmap.h |  9 +

What about include/linux/bitmap.h? I think we'd place it there first.

>  tools/perf/builtin-script.c  | 16 +++-
>  tools/perf/util/session.c| 16 +++-
>  3 files changed, 39 insertions(+), 2 deletions(-)
> 
> diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h
> index 28f5493da491..79998b26eb04 100644
> --- a/tools/include/linux/bitmap.h
> +++ b/tools/include/linux/bitmap.h
> @@ -2,6 +2,7 @@
>  #define _PERF_BITOPS_H
>  
>  #include 
> +#include 
>  #include 
>  
>  #define DECLARE_BITMAP(name,bits) \
> @@ -22,6 +23,14 @@ void __bitmap_or(unsigned long *dst, const unsigned long 
> *bitmap1,
>  #define small_const_nbits(nbits) \
>   (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG)
>  
> +static inline void bitmap_from_u64(unsigned long *_mask, u64 mask)

Inline is not required. Some people don't like it. Underscored parameter in
function declaration is not the best idea as well. Try:
static void bitmap_from_u64(unsigned long *bitmap, u64 mask)

> +{
> + _mask[0] = mask & ULONG_MAX;
> +
> + if (sizeof(mask) > sizeof(unsigned long))
> + _mask[1] = mask >> 32;
> +}
> +
>  static inline void bitmap_zero(unsigned long *dst, int nbits)
>  {
>   if (small_const_nbits(nbits))
> diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
> index e3ce2f34d3ad..73928310fd91 100644
> --- a/tools/perf/builtin-script.c
> +++ b/tools/perf/builtin-script.c
> @@ -412,11 +412,25 @@ static void print_sample_iregs(struct perf_sample 
> *sample,
>   struct regs_dump *regs = &sample->intr_regs;
>   uint64_t mask = attr->sample_regs_intr;
>   unsigned i = 0, r;
> + unsigned long _mask[sizeof(mask)/sizeof(unsigned long)];

If we start with it, I think we'd hide declaration machinery as well:

#define DECLARE_L64_BITMAP(__name) unsigned long 
__name[sizeof(u64)/sizeof(unsigned long)]
or
#define L64_BITMAP_SIZE (sizeof(u64)/sizeof(unsigned long))

Or both :) Whatever you prefer.

>  
>   if (!regs)
>   return;
>  
> - for_each_set_bit(r, (unsigned long *) &mask, sizeof(mask) * 8) {
> + /*
> +  * Since u64 is passed as 'unsigned long *', check
> +  * to see whether we need to swap words within u64.
> +  * Reason being, in 32 bit big endian userspace on a
> +  * 64bit kernel, 'unsigned long' is 32 bits.
> +  * When reading u64 using (u32 *)(&val)[0] and (u32 *)(&val)[1],
> +  * we will get wrong value for the mask. This is what
> +  * find_first_bit() and find_next_bit() is doing.
> +  * Issue here is "(u32 *)(&val)[0]" gets upper 32 bits of u64,
> +  * but perf assumes it gets lower 32bits of u64. Hence the check
> +  * and swap.
> +  */
> + bitmap_from_u64(_mask, mask);
> + for_each_set_bit(r, _mask, sizeof(mask) * 8) {
>   u64 val = regs->regs[i++];
>   printf("%5s:0x%"PRIx64" ", perf_reg_name(r), val);
>   }
> diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
> index 5214974e841a..1337b1c73f82 100644
> --- a/tools/perf/util/session.c
> +++ b/tools/perf/util/session.c
> @@ -940,8 +940,22 @@ static void branch_stack__printf(struct perf_sample 
> *sample)
>  static void regs_dump__printf(u64 mask, u64 *regs)
>  {
>   unsign

Re: [PATCH] mm: slab.h: use ilog2() in kmalloc_index()

2016-06-21 Thread Yury Norov
On Tue, Jun 21, 2016 at 02:52:37PM -0700, Andrew Morton wrote:
> On Tue, 21 Jun 2016 02:33:06 +0300 Yury Norov  wrote:
> 
> > kmalloc_index() uses simple straightforward way to calculate
> > bit position of nearest or equal upper power of 2.
> > This effectively results in generation of 24 episodes of
> > compare-branch instructions in assembler.
> > 
> > There is shorter way to calculate this: fls(size - 1).
> > 
> > The patch removes hard-coded calculation of kmalloc slab and
> > uses ilog2() instead that works on top of fls(). ilog2 is used
> > with intention that compiler also might optimize constant case
> > during compile time if it detects that.
> > 
> > BUG() is moved to the beginning of function. We left it here to
> > provide identical behaviour to previous version. It may be removed
> > if there's no requirement in it anymore.
> > 
> > While we're at this, fix comment that describes return value.
> 
> kmalloc_index() is always called with a constant-valued `size' (see
> __builtin_constant_p() tests)

It might change one day. This function is public to any slab user.
If you really want to allow call kmalloc_index() for constants only,
you'd place __builtin_constant_p() tests inside kmalloc_index().

> so the compiler will evaluate the switch
> statement at compile-time.  This will be more efficient than calling
> fls() at runtime.

There will be no fls() for constant at runtime because ilog2() calculates 
constant values at compile-time as well. From this point of view,
this patch removes code duplication, as we already have compile-time
log() calculation in kernel, and should re-use it whenever possible.

Yury.


Re: [RFC PATCH 00/27] ARM64: support ILP32

2016-06-21 Thread Yury Norov
On Tue, Jun 21, 2016 at 08:06:18PM +0800, Zhangjian (Bamvor) wrote:
> Hi,
> 
> 
> In our test, we need to fix stack pointer in makecontext. Not sure
> if it should be a standalone patch:

Thank you, I'll take it.

> 
> From 1d51ca34034ef83ea602874a93e26fd158ddd214 Mon Sep 17 00:00:00 2001
> From: Jun Ji 
> Date: Fri, 29 Apr 2016 17:20:23 +0800
> Subject: [PATCH] fix for makecontext error
> 
> Signed-off-by: Jun Ji 
> 
> ---
>  sysdeps/unix/sysv/linux/aarch64/makecontext.c | 10 +-
>  1 file changed, 5 insertions(+), 5 deletions(-)
> 
> diff --git a/sysdeps/unix/sysv/linux/aarch64/makecontext.c 
> b/sysdeps/unix/sysv/linux/aarch64/makecontext.c
> index 34f91a3..55a26a3 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/makecontext.c
> +++ b/sysdeps/unix/sysv/linux/aarch64/makecontext.c
> @@ -42,18 +42,18 @@ void
>  __makecontext (ucontext_t *ucp, void (*func) (void), int argc, ...)
>  {
>extern void __startcontext (void);
> -  unsigned long int *sp;
> +  unsigned long long *sp;
>va_list ap;
>int i;
> 
> -  sp = (unsigned long int *)
> +  sp = (unsigned long long *)
>  ((uintptr_t) ucp->uc_stack.ss_sp + ucp->uc_stack.ss_size);
> 
>/* Allocate stack arguments.  */
>sp -= argc < 8 ? 0 : argc - 8;
> 
>/* Keep the stack aligned.  */
> -  sp = (unsigned long int *) (((uintptr_t) sp) & -16L);
> +  sp = (unsigned long long *) (((uintptr_t) sp) & -16L);
> 
>ucp->uc_mcontext.regs[19] = (uintptr_t) ucp->uc_link;
>ucp->uc_mcontext.sp = (uintptr_t) sp;
> @@ -64,9 +64,9 @@ __makecontext (ucontext_t *ucp, void (*func) (void), int 
> argc, ...)
>va_start (ap, argc);
>for (i = 0; i < argc; ++i)
>  if (i < 8)
> -  ucp->uc_mcontext.regs[i] = va_arg (ap, unsigned long int);
> +  ucp->uc_mcontext.regs[i] = va_arg (ap, unsigned long long);
>  else
> -  sp[i - 8] = va_arg (ap, unsigned long int);
> +  sp[i - 8] = va_arg (ap, unsigned long long);
> 
>va_end (ap);
>  }
> -- 
> 1.8.4.5
> 
> Regards
> 
> Bamvor
> 
> On 2016/6/21 13:06, Yury Norov wrote:
> >This series enables aarch64 port with ilp32 mode.
> >
> >After long discussions in kernel list, we finally got
> >consensus on how ABI should look. This patchset adds
> >support for the ABI in GLIBC. It is tested with LTP
> >with no big regressions comparing to LP64 and AARCH32.
> >
> >Though it's very raw. Please be patient reviewing it.
> >
> >ABI details:
> >  - types are taken from AARCH32, next types turned to 64-bit,
> >as modern requirement for new APIs tells:
> > ino_t  is  u64 type
> > off_t  is  s64 type
> > blkcnt_t   is  s64 type
> > fsblkcnt_t is  u64 type
> > fsfilcnt_t is  u64 type
> >  - 64-bit arguments are passed in syscall as register pair,
> >as kernel internally clears top halves for all input regs;
> >  - standard syscall table is used;
> >  - 32-bit time_t is used. AARCH64/ILP32 is waiting for general
> >fix of Y2038 problem just like other 32-bit arches;
> >  - stat{64}, statfs{64} structures are of the identical layout
> >with LP64. Corresponding syscalls are taken from 64-bit code.
> >
> >Links:
> >This series: https://github.com/norov/glibc/commits/ilp32-2.23
> >Kernel series: https://github.com/norov/linux/commits/ilp32-nowrap
> >Kernel in LKML: https://lkml.org/lkml/2016/6/17/990
> >
> >Please review it. Any comments appreciated.
> >
> >Yury.
> >
> >Andrew Pinski (24):
> >   [AARCH64] Fix utmp struct for compatibility reasons.
> >   [AARCH64] Add header guards to sysdep.h headers.
> >   Add dynamic ILP32 AARCH64 relocations to elf.h
> >   [AARCH64] Add PTR_REG, PTR_LOG_SIZE, and PTR_SIZE.  Use it in
> > LDST_PCREL and LDST_GLOBAL.
> >   [AARCH64] Use PTR_REG in crti.S.
> >   [AARCH64] Use PTR_REG/PTR_SIZE/PTR_SIZE_LOG in dl-tlsesc.S
> >   [AARCH64] Use PTR_* macros in dl-trampoline.S
> >   [AARCH64] Use PTR_* in start.S
> >   [AARCH64] Use PTR_REG in getcontext.S.
> >   [AARCH64] Detect ILP32 in configure scripts.
> >   [AARCH64] Syscalls for ILP32 are passed always via 64bit values.
> >   [AARCH64] Add ILP32 support to elf_machine_load_address.
> >   [AARCH64] Set up wordsize for ILP32.
> >   [AARCH64] Add ILP32 to makefiles
> >   [AARCH64] Add support to ldconfig for ILP32 and libilp32
> >   [AARCH64] Add ILP32 ld.so to the known interpreter names.
> >   [AARCH64] Add ldd-rewrite.sed so that ilp32 ld.so can be found
> >   [AARCH64] Add kernel_sigaction.h for AA

Re: [PATCH 05/27] [AARCH64] Use PTR_REG in crti.S.

2016-06-21 Thread Yury Norov
On Tue, Jun 21, 2016 at 10:28:40AM +, Joseph Myers wrote:
> On Tue, 21 Jun 2016, Yury Norov wrote:
> 
> > +#ifdef __LP64__
> > +#define RTLD_START RTLD_START_1("x", "3", "sp")
> > +#else
> > +#define RTLD_START RTLD_START_1("w", "2", "wsp")
> > +#endif
> 
> As well as preprocessor indentation, this is missing spaces after '('; 
> check for and fix that issue throughout this patch series.  (There are 
> only a few cases where a macro is used to construct a type / variable 
> name, such as ElfW, where missing the space is more usual.)
> 
> -- 
> Joseph S. Myers
> jos...@codesourcery.com

So, you want have it like this?
#ifdef __LP64__
# define RTLD_START RTLD_START_1( "x", "3", "sp" )
#else
# define RTLD_START RTLD_START_1( "w", "2", "wsp" )
#endif

Pretty counter-intuitive to me...
Anyway, I'll follow it.


Re: [RFC2 PATCH 00/23] ARM64: support ILP32

2016-09-08 Thread Yury Norov
On Thu, Sep 08, 2016 at 12:25:27PM +0800, Zhangjian (Bamvor) wrote:
> Hi, Guys
> 
> There was a discussion about bump vdso version of kernel. We need
> update the vdso version in glibc correspondingly otherwise the
> application could not make use of the vdso.
> 
> Is it make sense to you?
> 
> Regards
> 
> Bamvor

Hi Bamvor,

Is this the source of performance loss you've discovered?

> 
> commit 3ffc1d798fc25ccb02e7cc325fe5fb3890c085e3
> Author: Bamvor Jian Zhang 
> Date:   Thu Sep 8 12:21:16 2016 +0800
> 
> [AARCH64] ILP32: bump vdso version consistent with kernel
> 
> Signed-off-by: Jun Ji 
> Signed-off-by: Bamvor Jian Zhang 
> 
> diff --git a/sysdeps/unix/sysv/linux/aarch64/init-first.c 
> b/sysdeps/unix/sysv/linux/aarch64/init-first.c
> index f7224a2..3e4eaad 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/init-first.c
> +++ b/sysdeps/unix/sysv/linux/aarch64/init-first.c
> @@ -27,17 +27,21 @@ int (*VDSO_SYMBOL(clock_getres)) (clockid_t, struct 
> timespec *);
>  static inline void
>  _libc_vdso_platform_setup (void)
>  {
> -  PREPARE_VERSION (linux2639, "LINUX_2.6.39", 123718537);
> +#ifndef __ILP32__
> +  PREPARE_VERSION (linux, "LINUX_2.6.39", 123718537);
> +#else
> +  PREPARE_VERSION (linux, "LINUX_4.8", 61765624);
> +#endif /* #ifndef __ILP32__ */
> 
> -  void *p = _dl_vdso_vsym ("__kernel_gettimeofday", &linux2639);
> +  void *p = _dl_vdso_vsym ("__kernel_gettimeofday", &linux);
>PTR_MANGLE (p);
>VDSO_SYMBOL(gettimeofday) = p;
> 
> -  p = _dl_vdso_vsym ("__kernel_clock_gettime", &linux2639);
> +  p = _dl_vdso_vsym ("__kernel_clock_gettime", &linux);
>PTR_MANGLE (p);
>VDSO_SYMBOL(clock_gettime) = p;
> 
> -  p = _dl_vdso_vsym ("__kernel_clock_getres", &linux2639);
> +  p = _dl_vdso_vsym ("__kernel_clock_getres", &linux);
>PTR_MANGLE (p);
>VDSO_SYMBOL(clock_getres) = p;
>  }
> 
> 
> On 2016/6/29 0:39, Yury Norov wrote:
> >This series enables aarch64 port with ilp32 mode.
> >
> >ABI details:
> > - types are taken from AARCH32, next types turned to 64-bit,
> >   as modern requirement for new APIs tells:
> > ino_t  is  u64 type
> > off_t  is  s64 type
> > blkcnt_t   is  s64 type
> > fsblkcnt_t is  u64 type
> > fsfilcnt_t is  u64 type
> > - 64-bit arguments are passed in syscall as register pair,
> >   as kernel internally clears top halves for all input regs;
> > - standard syscall table is used;
> > - 32-bit time_t is used. AARCH64/ILP32 is waiting for general
> >   fix of Y2038 problem just like other 32-bit arches;
> > - stat{64}, statfs{64} structures are of the identical layout
> >   with LP64. Corresponding syscalls are taken from 64-bit code
> > - struct utmp, utmpx layouts are the same.
> >
> >v1: https://sourceware.org/ml/libc-alpha/2016-06/msg00730.html
> >v2:
> > - rebased on master;
> > - dropped unneeded/unrelated patches;
> > - pread family platform implementation removed;
> > - addressed v1 comments (I'm really sorry if I missed something,
> >   there are a lot of them, and I am really thankfull for detailed review);
> >
> >Tested with LTP. Regressions are like this:
> >abort01FAIL   2
> >kill11 FAIL   2
> >mmap16 FAIL   6
> >open12 FAIL   2
> >rename11   FAIL   2
> >rmdir02FAIL   2
> >umount2_01 FAIL   2
> >umount2_02 FAIL   2
> >umount2_03 FAIL   2
> >utime06FAIL   2
> >
> >It's better than v1, and there are ~5 additional regressions comparing to
> >aarch64, all are related to filesystem.
> >
> >Andrew Pinski (17):
> >  [AARCH64] define word size for lp64 and ilp32
> >  [AARCH64] Add header guards to sysdep.h headers.
> >  Add dynamic ILP32 AARCH64 relocations to elf.h
> >  [AARCH64] Add PTR_REG, PTR_LOG_SIZE, and PTR_SIZE.  Use it in
> >LDST_PCREL and LDST_GLOBAL.
> >  [AARCH64] Use PTR_REG in crti.S.
> >  [AARCH64] Use PTR_REG/PTR_SIZE/PTR_SIZE_LOG in dl-tlsesc.S
> >  [AARCH64] Use PTR_* macros in dl-trampoline.S
> >  [AARCH64] Use PTR_* in start.S
> >  [AARCH64] Use PTR_REG in getcontext.S.
> >  [AARCH64] Detect ILP32 in configure scripts.
> >  [AARCH64] Add ILP32 support to elf_machine_load_address.
> >  [AARCH64] Add ILP32 to mak

Re: [PATCH 12/18] arm64: ilp32: add sys_ilp32.c and a separate table (in entry.S) to use it

2016-09-02 Thread Yury Norov
On Fri, Sep 02, 2016 at 02:55:34PM +0200, Arnd Bergmann wrote:
> On Friday, September 2, 2016 6:46:19 PM CEST Bamvor Jian Zhang wrote:
> > diff --git a/arch/arm64/include/uapi/asm/unistd.h 
> > b/arch/arm64/include/uapi/asm/unistd.h
> > index 043d17a..78bea1d 100644
> > --- a/arch/arm64/include/uapi/asm/unistd.h
> > +++ b/arch/arm64/include/uapi/asm/unistd.h
> > @@ -16,4 +16,9 @@
> > 
> >  #define __ARCH_WANT_RENAMEAT
> > 
> > +/* We need to make sure it works for both userspace and 
> > kernel(sys_ilp32.c) */
> > +#if defined(__ILP32__) || defined(__SYSCALL_COMPAT)
> > +#define __ARCH_WANT_SYNC_FILE_RANGE2
> > +#endif
> > +
> >  #include 
> > diff --git a/arch/arm64/kernel/sys_ilp32.c b/arch/arm64/kernel/sys_ilp32.c
> > index 10fc0ca..13c9c9d 100644
> > --- a/arch/arm64/kernel/sys_ilp32.c
> > +++ b/arch/arm64/kernel/sys_ilp32.c
> > @@ -42,7 +42,7 @@
> >  #define compat_sys_pwrite64compat_sys_pwrite64_wrapper
> >  #define compat_sys_readahead   compat_sys_readahead_wrapper
> >  #define compat_sys_shmat   sys_shmat
> > -#define compat_sys_sync_file_range compat_sys_sync_file_range2_wrapper
> > +#define compat_sys_sync_file_range2compat_sys_sync_file_range2_wrapper
> >  #define compat_sys_truncate64  compat_sys_truncate64_wrapper
> >  #define sys_mmap2  compat_sys_mmap2_wrapper
> >  #define sys_ptrace compat_sys_ptrace
> > 
> 
> Looks good to me.
> 
>   Arnd

Thank you. I'll take it.

Yury.


Re: [PATCH 16/20] arm64: signal32: move ilp32 and aarch32 common code to separated file

2017-06-20 Thread Yury Norov
On Mon, Jun 19, 2017 at 05:16:42PM +0100, James Morse wrote:
> Hi Yury,
> 
> On 04/06/17 13:00, Yury Norov wrote:
> > Signed-off-by: Yury Norov 
> 
> Can I offer a body for the commit message:
> ILP32 needs to mix 32bit struct siginfo and 64bit sigframe for its signal
> handlers. Move the existing compat code for copying siginfo to user space and
> manipulating signal masks into signal32_common.c so it can be used to deliver
> aarch32 and ilp32 signals.

Ok

> > diff --git a/arch/arm64/include/asm/signal32.h 
> > b/arch/arm64/include/asm/signal32.h
> > index e68fcce538e1..1c4ede717bd2 100644
> > --- a/arch/arm64/include/asm/signal32.h
> > +++ b/arch/arm64/include/asm/signal32.h
> > @@ -13,6 +13,9 @@
> >   * You should have received a copy of the GNU General Public License
> >   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
> >   */
> > +
> > +#include 
> > +
> >  #ifndef __ASM_SIGNAL32_H
> >  #define __ASM_SIGNAL32_H
> 
> Nit: This should go inside the guard.
 
Ok, thanks. Will fix this and all below
 
> > diff --git a/arch/arm64/kernel/signal32_common.c 
> > b/arch/arm64/kernel/signal32_common.c
> > new file mode 100644
> > index ..5bddc25dca12
> > --- /dev/null
> > +++ b/arch/arm64/kernel/signal32_common.c
> > @@ -0,0 +1,135 @@
> [...]
> > +#include 
> > +#include 
> > +#include 
> 
> What do you need ratelimit.h for?
> 
> 
> > +#include 
> > +
> > +#include 
> 
> I can't see anything using these ESR_ macros in here...
> 
> 
> > +#include 
> 
> This was for the VFP save/restore code, which you didn't move...
> 
> 
> > +#include 
> > +#include 
> 
> [...]
> 
> 
> > +int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t 
> > *from)
> [...]
> > +   case __SI_FAULT:
> > +   err |= __put_user((compat_uptr_t)(unsigned long)from->si_addr,
> > + &to->si_addr);
> 
> This looks tricky. si_addr comes from FAR_EL1 when user-space touches 
> something
> it shouldn't. This could be a 64bit value as ilp32 processes can still branch 
> to
> 64bit addresses in registers and generate loads that cross the invisible 4GB
> boundary. Here you truncate the 64bit address.
> Obviously this can't happen at all with aarch32, and for C programs its into
> undefined-behaviour territory, but it doesn't feel right to pass an address to
> user-space that we know is wrong... but we don't have an alternative.
> 
> This looks like a class of problem particular to ilp32/x32: 'accessed an 
> address
> you can't encode with a signal'. After a quick dig in x86's x32 code, it looks
> like they only pass the first 32bits of si_addr too.
> 
> One option is to mint a new si_code to go with SIGBUS meaning something like
> 'address overflowed si_addr'. Alternatively we could just kill tasks that do 
> this.

New SIGBUS sounds reasonable at the first glance, but I think it should be
discussed widely at first, and the patch that implements it should touch
all arches that may be affected.

Yury


Re: [PATCH v3 2/4] asm-generic: Provide a fncpy() implementation

2017-06-20 Thread Yury Norov
On Mon, Jun 19, 2017 at 01:58:53PM -0700, Florian Fainelli wrote:
> On 06/18/2017 04:51 PM, Yury Norov wrote:
> > Hi Florian,
> > 
> > Some questions and thoughts inline.
> > 
> > Yury
> > 
> > On Fri, Jun 16, 2017 at 05:07:42PM -0700, Florian Fainelli wrote:
> >> Define a generic fncpy() implementation largely based on the ARM version
> >> that requires an 8 bytes alignment for the destination address where to
> >> copy this function as well as the function's own address.
> >>
> >> Signed-off-by: Florian Fainelli 
> >> ---
> >>  include/asm-generic/fncpy.h | 93 
> >> +
> >>  1 file changed, 93 insertions(+)
> >>  create mode 100644 include/asm-generic/fncpy.h
> >>
> >> diff --git a/include/asm-generic/fncpy.h b/include/asm-generic/fncpy.h
> >> new file mode 100644
> >> index ..ec03b83b8535
> >> --- /dev/null
> >> +++ b/include/asm-generic/fncpy.h
> >> @@ -0,0 +1,93 @@
> >> +/*
> >> + * include/asm-generic/fncpy.h - helper macros for function body copying
> >> + *
> >> + * Copyright (C) 2011 Linaro Limited
> >> + *
> >> + * This program is free software; you can redistribute it and/or modify
> >> + * it under the terms of the GNU General Public License version 2 as
> >> + * published by the Free Software Foundation.
> >> + *
> >> + * This program is distributed in the hope that it will be useful,
> >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> >> + * GNU General Public License for more details.
> >> + *
> >> + * You should have received a copy of the GNU General Public License
> >> + * along with this program; if not, write to the Free Software
> >> + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
> >> + */
> >> +
> >> +/*
> >> + * These macros are intended for use when there is a need to copy a 
> >> low-level
> >> + * function body into special memory.
> >> + *
> >> + * For example, when reconfiguring the SDRAM controller, the code doing 
> >> the
> >> + * reconfiguration may need to run from SRAM.
> >> + *
> >> + * NOTE: that the copied function body must be entirely self-contained and
> >> + * position-independent in order for this to work properly.
> >> + *
> >> + * NOTE: in order for embedded literals and data to get referenced 
> >> correctly,
> >> + * the alignment of functions must be preserved when copying.  To ensure 
> >> this,
> >> + * the source and destination addresses for fncpy() must be aligned to a
> >> + * multiple of 8 bytes: you will be get a BUG() if this condition is not 
> >> met.
> >> + * You will typically need a ".align 3" directive in the assembler where 
> >> the
> >> + * function to be copied is defined, and ensure that your allocator for 
> >> the
> >> + * destination buffer returns 8-byte-aligned pointers.
> >> + *
> >> + * Typical usage example:
> >> + *
> >> + * extern int f(args);
> >> + * extern uint32_t size_of_f;
> >> + * int (*copied_f)(args);
> >> + * void *sram_buffer;
> >> + *
> >> + * copied_f = fncpy(sram_buffer, &f, size_of_f);
> >> + *
> >> + * ... later, call the function: ...
> >> + *
> >> + * copied_f(args);
> >> + *
> >> + * The size of the function to be copied can't be determined from C:
> >> + * this must be determined by other means, such as adding assmbler 
> >> directives
> >> + * in the file where f is defined.
> >> + */
> >> +
> >> +#ifndef __ASM_FNCPY_H
> >> +#define __ASM_FNCPY_H
> >> +
> >> +#include 
> >> +#include 
> >> +
> >> +#include 
> >> +#include 
> >> +
> >> +/*
> >> + * Minimum alignment requirement for the source and destination addresses
> >> + * for function copying.
> >> + */
> >> +#define FNCPY_ALIGN 8
> > 
> > From now this is not arm-only, and it's possible that some architectures
> > might want to redefine it in their arch/xxx/include/asm/fncpy.h files.
> > So it will be easier for them if you'll wrap FNCPY_ALIGN here with #ifdef
> > guards.
> > 
> > By the way, compiler already has an information

Re: [PATCH v3 2/4] asm-generic: Provide a fncpy() implementation

2017-06-20 Thread Yury Norov
On Mon, Jun 19, 2017 at 06:43:48PM +0100, Russell King - ARM Linux wrote:
> On Mon, Jun 19, 2017 at 06:18:18PM +0300, Yury Norov wrote:
> > One else thing I forgot to ask - now you have the generic
> > implementation for fncpy(), so do you really need to save arm
> > version of it?
> 
> This was covered in the review of v1, which took the ARM version
> and incorrectly used it as an asm-generic implementation.
> 
> I explicitly asked Florian _not_ to copy the ARM fncpy() version
> to asm-generic because it has (surprise surprise) ARM specific
> behaviours that do not belong in a cross-architecture generic
> version.
> 
> Namely, the ARM specific behaviour that bit 0 of a code address is
> used to signal whether the code should be executed as ARM code or
> as Thumb code.
> 
> This behaviour has no meaning on other architectures (eg, x86)
> where code addresses are not 32-bit aligned.
> 
> So, suggesting that the ARM fncpy() should be used as an asm-generic
> version is completely absurd, and just because we have an asm-generic
> version also does not mean ARM should use it.
> 
> Florian's approach to providing an asm-generic version, leaving the
> ARM specific version is entirely correct and appropriate.
> 
> So, in answer to your question, yes, we need _both_ an ARM specific
> version and an asm-generic version, where the ARM specific version is
> different from the asm-generic version.  Purely because it needs
> architecture specific details.

Hi Russell, Florian,

Thanks for clarifications. Thumb bit is a good reason to save arm
version, and I completely agree with you in this. Sorry that missed it
in the v1 discussion.

> I explicitly asked Florian _not_ to copy the ARM fncpy() version
> to asm-generic because it has (surprise surprise) ARM specific
> behaviours that do not belong in a cross-architecture generic
> version.

But it seems that v3 does exactly that - copies arm with very small
changes. :) Maybe there are good reasons to have arm version exactly
how it looks now, but in general case, for me, some things that
it does are not needed. I mean checking the alignment of the source and
the type of destination. And after some headscratching I became even
more convinced that for the general case it would be much preferable
to write the fncpy() as regular function in .c file, not a macro, at
least to have the corresponding symbol in binary and let the assembler
code to call it, which is very probable.

Yury


Re: [PATCH 05/20] arm64: rename COMPAT to AARCH32_EL0 in Kconfig

2017-06-20 Thread Yury Norov
On Mon, Jun 19, 2017 at 04:58:16PM +0100, James Morse wrote:
> Hi Yury,
> 
> On 04/06/17 12:59, Yury Norov wrote:
> > From: Andrew Pinski 
> > 
> > In this patchset  ILP32 ABI support is added. Additionally to AARCH32,
> > which is binary-compatible with ARM, ILP32 is (mostly) ABI-compatible.
> > 
> > From now, AARCH32_EL0 (former COMPAT) config option means the support of
> > AARCH32 userspace, ARM64_ILP32 - support of ILP32 ABI (see next patches),
> > and COMPAT indicates that one of them, or both, is enabled.
> > 
> > Where needed, CONFIG_COMPAT is changed over to use CONFIG_AARCH32_EL0 
> > instead
> 
> Nit: You have 'COMPAT' around compat_hwcap_str's definition, but its only user
> is wrapped in 'AARCH32_EL0'.
> 
> 
> After this patch
> arch/arm64/kernel/perf_callchain.c::perf_callchain_user() still has:
> > if (!compat_user_mode(regs)) {
> > /* AARCH64 mode */
> ...
> > } else {
> > #ifdef CONFIG_COMPAT
> > /* AARCH32 compat mode */
> ...
> > #endif
> > }
> 
> I think this one should become CONFIG_AARCH32_EL0. compat to this code means 
> the
> fp is 'compat_fp' in x11, and it should read a 32bit call chain from 
> user-space.
 
Thanks, will fix it. 

> This is confusing as 'is_compat_task()' matches one of aarch32 or ilp32, but
> compat_user_mode(regs) only matches aarch32 as it checks the saved spsr. I 
> can't
> see any problem caused by this today, but its going to bite someone in the
> future. Can this be renamed aarch32_user_mode()? (turns out 'a32' is the name 
> of
> just one of aarch32's instruction sets[0].)

compat_thumb_mode may be also renamed, and compat_setup_frame and
compat_setup_rt_frame, and some others. If you think that it may
confuse, I'll do rename.

Yury


Re: [PATCH 05/20] arm64: rename COMPAT to AARCH32_EL0 in Kconfig

2017-06-22 Thread Yury Norov
On Wed, Jun 21, 2017 at 02:10:03AM +0300, Yury Norov wrote:
> On Mon, Jun 19, 2017 at 04:58:16PM +0100, James Morse wrote:
> > Hi Yury,
 
[...]

> > This is confusing as 'is_compat_task()' matches one of aarch32 or ilp32, but
> > compat_user_mode(regs) only matches aarch32 as it checks the saved spsr. I 
> > can't
> > see any problem caused by this today, but its going to bite someone in the
> > future. Can this be renamed aarch32_user_mode()? (turns out 'a32' is the 
> > name of
> > just one of aarch32's instruction sets[0].)
> 
> compat_thumb_mode may be also renamed, and compat_setup_frame and
> compat_setup_rt_frame, and some others. If you think that it may
> confuse, I'll do rename.

So this is what I found for now. I'm not sure this list is complete though.

Some aarch32 functions and macros with 'compat' in the name are
exported to userspace, so I left them untouched. Also, we derive
binfmt_elf32.c for aarch32 from general compat_binfmt, so some aarch32
functions should have 'compat' name.

Maybe it was not the best idea to reuse the existing 'compat' term with a new
meaning. But if we choose to introduce some new term for it, like
'is_32_task()', we'd end up with another round of painful renaming.

Anyway, the patch is ready. If you or anyone else finds something more
to rename - just let me know.

rename compat functions:
https://github.com/norov/linux/commit/6d46b52e1dab6490076c09ddfbcd4f4821dbadae

rename compat_elf_hwcap and compat_elf_hwcap2 (I will meld with
previous one):
https://github.com/norov/linux/commit/a1d94452e14b2d7aa5b99a94f9f928f1ebe9566f

Yury


commit 6d46b52e1dab6490076c09ddfbcd4f4821dbadae
Author: Yury Norov 
Date:   Wed Jun 21 14:25:25 2017 +0300

arm64: rename functions like compat_foo() to a32_foo()

    The ILP32 for ARM64 patch series introduces another 'compat' mode for
    arm64. So, to avoid confusion, aarch32-only functions are renamed
    accordingly.

diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h
index d668b3900b52..58762bd39314 100644
--- a/arch/arm64/include/asm/ptrace.h
+++ b/arch/arm64/include/asm/ptrace.h
@@ -126,16 +126,16 @@ struct pt_regs {
 #define arch_has_single_step() (1)
 
 #ifdef CONFIG_AARCH32_EL0
-#define compat_thumb_mode(regs) \
+#define a32_thumb_mode(regs) \
(((regs)->pstate & COMPAT_PSR_T_BIT))
 #else
-#define compat_thumb_mode(regs) (0)
+#define a32_thumb_mode(regs) (0)
 #endif
 
 #define user_mode(regs)\
(((regs)->pstate & PSR_MODE_MASK) == PSR_MODE_EL0t)
 
-#define compat_user_mode(regs) \
+#define a32_user_mode(regs)\
(((regs)->pstate & (PSR_MODE32_BIT | PSR_MODE_MASK)) == \
 (PSR_MODE32_BIT | PSR_MODE_EL0t))
 
@@ -149,10 +149,10 @@ struct pt_regs {
(!((regs)->pstate & PSR_F_BIT))
 
 #define GET_USP(regs) \
-   (!compat_user_mode(regs) ? (regs)->sp : (regs)->compat_sp)
+   (!a32_user_mode(regs) ? (regs)->sp : (regs)->compat_sp)
 
 #define SET_USP(ptregs, value) \
-   (!compat_user_mode(regs) ? ((regs)->sp = value) : ((regs)->compat_sp = 
value))
+   (!a32_user_mode(regs) ? ((regs)->sp = value) : ((regs)->compat_sp = 
value))
 
 extern int regs_query_register_offset(const char *name);
 extern unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
diff --git a/arch/arm64/include/asm/signal32.h 
b/arch/arm64/include/asm/signal32.h
index e68fcce538e1..0bde8a0e33c7 100644
--- a/arch/arm64/include/asm/signal32.h
+++ b/arch/arm64/include/asm/signal32.h
@@ -26,27 +26,27 @@
 
 extern const compat_ulong_t aarch32_sigret_code[6];
 
-int compat_setup_frame(int usig, struct ksignal *ksig, sigset_t *set,
+int a32_setup_frame(int usig, struct ksignal *ksig, sigset_t *set,
   struct pt_regs *regs);
-int compat_setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set,
+int a32_setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set,
  struct pt_regs *regs);
 
-void compat_setup_restart_syscall(struct pt_regs *regs);
+void a32_setup_restart_syscall(struct pt_regs *regs);
 #else
 
-static inline int compat_setup_frame(int usid, struct ksignal *ksig,
+static inline int a32_setup_frame(int usid, struct ksignal *ksig,
 sigset_t *set, struct pt_regs *regs)
 {
return -ENOSYS;
 }
 
-static inline int compat_setup_rt_frame(int usig, struct ksignal *ksig, 
sigset_t *set,
+static inline int a32_setup_rt_frame(int usig, struct ksignal *ksig, sigset_t 
*set,
struct pt_regs *regs)
 {
return -ENOSYS;
 }
 
-static inline void compat_setup_restart_syscall(struct pt_regs *regs)
+static inline void a32_setup_restart_syscall(struct pt_regs *regs)
 {
 }
 #endif /* CONFIG_AARCH32_EL0 */
diff --git a/arch/arm64/kernel/a

Re: [PATCH 0/2] arm64 SMMUv3 PMU driver with IORT support

2017-10-31 Thread Yury Norov
Hi Neil,

On Fri, Aug 04, 2017 at 03:59:12PM -0400, Neil Leeder wrote:
> This adds a driver for the SMMUv3 PMU into the perf framework.
> It includes an IORT update to support PM Counter Groups.
> 
> IORT has no mechanism for determining device names so PMUs
> are named based on their physical address. 
> 
> Tested on Qualcomm QDF2400. perf_fuzzer ran for 4+ hours
> with no failures.
> 
> Neil Leeder (2):
>   acpi: arm64: add iort support for PMCG
>   perf: add arm64 smmuv3 pmu driver
> 
>  drivers/acpi/arm64/iort.c |  54 +++
>  drivers/perf/Kconfig  |   9 +
>  drivers/perf/Makefile |   1 +
>  drivers/perf/arm_smmuv3_pmu.c | 823 
> ++
>  include/acpi/actbl2.h |   9 +-
>  5 files changed, 895 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/perf/arm_smmuv3_pmu.c

I try to run your driver on ThunderX2, but perf list doesn't show new
events, and example in description in patch 2 also doesn't work:
yury@VAL1-25:~/linux$ tools/perf/perf stat -e 
smmu_0_ff88840/transaction,filter_enable=1, 
filter_span=1,filter_stream_id=0x42/ -a pwd
event syntax error: '..ter_enable=1,'
  \___ parser error
Run 'perf list' for a list of valid events 

 Usage: perf stat [] []

-e, --eventevent selector. use 'perf list' to list available 
events

I run v4.14-rc7 kernel plus this series. The config is attached. I
found that platform_match() never return 1 for arm-smmu-pmu and so
the driver never probed.

Maybe it's my local configuration issue?

Thanks for any help,
Yury


t99.config.gz
Description: application/gzip


Re: [PATCH v2] lib: optimize cpumask_next_and()

2017-10-25 Thread Yury Norov
On Tue, Oct 24, 2017 at 12:51:59PM +0200, Clement Courbet wrote:
> We've measured that we spend ~0.6% of sys cpu time in cpumask_next_and().
> It's essentially a joined iteration in search for a non-zero bit, which
> is currently implemented as a lookup join (find a nonzero bit on the
> lhs, lookup the rhs to see if it's set there).
> 
> Implement a direct join (find a nonzero bit on the incrementally built
> join). Direct benchmarking shows that it's 1.17x to 14x faster with a
> geometric mean of 2.1 on 32 CPUs. No impact on memory usage.
> 
> Approximate benchmark code:
> 
> ```
>   unsigned long src1p[nr_cpumask_longs] = {pattern1};
>   unsigned long src2p[nr_cpumask_longs] = {pattern2};
>   for (/*a bunch of repetitions*/) {
> for (int n = -1; n <= nr_cpu_ids; ++n) {
>   asm volatile("" : "+rm"(src1p)); // prevent any optimization
>   asm volatile("" : "+rm"(src2p));
>   unsigned long result = cpumask_next_and(n, src1p, src2p);
>   asm volatile("" : "+rm"(result));
> }
>   }
> ```
> Signed-off-by: Clement Courbet 
> ---
> Changes in v2:
>  - Refactored _find_next_common_bit into _find_next_bit., as suggested
>by Yury Norov.

What I actually suggested is make _find_next_and_bit() similar to
_find_next_bit(), not to extend _find_next_bit(). But what you did
looks OK.

> This has no adverse effects on the performance side,
>as the compiler successfully inlines the code.

I think it's not about inlining, compiler just optimizes out branches known
as false at compile-time.

>  include/asm-generic/bitops/find.h   | 16 ++
>  include/linux/bitmap.h  |  2 ++
>  lib/cpumask.c   |  9 
>  lib/find_bit.c  | 37 
> +
>  tools/include/asm-generic/bitops/find.h | 16 ++
>  5 files changed, 67 insertions(+), 13 deletions(-)
> 
> diff --git a/include/asm-generic/bitops/find.h 
> b/include/asm-generic/bitops/find.h
> index 998d4d544f18..130962f3a264 100644
> --- a/include/asm-generic/bitops/find.h
> +++ b/include/asm-generic/bitops/find.h
> @@ -15,6 +15,22 @@ extern unsigned long find_next_bit(const unsigned long 
> *addr, unsigned long
>   size, unsigned long offset);
>  #endif
>  
> +#ifndef find_next_and_bit
> +/**
> + * find_next_and_bit - find the next set bit in both memory regions
> + * @addr1: The first address to base the search on
> + * @addr2: The second address to base the search on
> + * @offset: The bitnumber to start searching at
> + * @size: The bitmap size in bits
> + *
> + * Returns the bit number for the next set bit
> + * If no bits are set, returns @size.
> + */
> +extern unsigned long find_next_and_bit(const unsigned long *addr1,
> + const unsigned long *addr2, unsigned long size,
> + unsigned long offset);
> +#endif
> +
>  #ifndef find_next_zero_bit
>  /**
>   * find_next_zero_bit - find the next cleared bit in a memory region
> diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
> index 700cf5f67118..b4606bfda52f 100644
> --- a/include/linux/bitmap.h
> +++ b/include/linux/bitmap.h
> @@ -77,6 +77,8 @@
>   * find_first_bit(addr, nbits)   Position first set bit in *addr
>   * find_next_zero_bit(addr, nbits, bit)  Position next zero bit in *addr 
> >= bit
>   * find_next_bit(addr, nbits, bit)   Position next set bit in *addr >= bit
> + * find_next_and_bit(addr1, addr2, nbits, bit)   Same as find_first_bit, 
> but in
> + *   (*addr1 & *addr2)
>   */
>  
>  /*
> diff --git a/lib/cpumask.c b/lib/cpumask.c
> index 8b1a1bd77539..5602223837fa 100644
> --- a/lib/cpumask.c
> +++ b/lib/cpumask.c
> @@ -32,10 +32,11 @@ EXPORT_SYMBOL(cpumask_next);
>  int cpumask_next_and(int n, const struct cpumask *src1p,
>const struct cpumask *src2p)
>  {
> - while ((n = cpumask_next(n, src1p)) < nr_cpu_ids)
> - if (cpumask_test_cpu(n, src2p))
> - break;
> - return n;
> + /* -1 is a legal arg here. */
> + if (n != -1)
> + cpumask_check(n);
> + return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p),
> + nr_cpumask_bits, n + 1);
>  }
>  EXPORT_SYMBOL(cpumask_next_and);
>  
> diff --git a/lib/find_bit.c b/lib/find_bit.c
> index 6ed74f78380c..ebc08fd9fdf8 100644
> --- a/lib/find_bit.c
> +++ b/lib/find_bit.c
> @@ -24,19 +24,25 @@
>  #if !defined(find_next_bit) || !defined(find_next_zero_bit)
>  
>  /*
> - * This is a common helper

Re: Re [PATCH v2] lib: optimize cpumask_next_and()

2017-10-25 Thread Yury Norov
On Wed, Oct 25, 2017 at 05:28:41PM +0200, Clement Courbet wrote:
> Thanks for the comments Yury.
> 
> > But I'd like also to keep _find_next_bit() consistent with
> > _find_next_bit_le()
> 
> Not sure I understand what you're suggesting here: Do you want a
> find_next_and_bit_le() or do you want to make _find_next_bit_le() more
> like _find_next_bit() ? In the latter case we might just want to merge
> it with _find_next_bit() and end up with an extra is_le parameter :)

Both ways will work, but I think that extra is_le is too much.
_find_next_bit_le() should be the copy of _find_next_bit() with the
addition of swapping code.

If you don't need find_next_and_bit_le(), don't add it.
find_{first,last}_bit() doesn't have LE version, for example.

Yury


Re: Re [PATCH v2] lib: optimize cpumask_next_and()

2017-10-25 Thread Yury Norov
On Wed, Oct 25, 2017 at 06:50:14PM +0300, Yury Norov wrote:
> On Wed, Oct 25, 2017 at 05:28:41PM +0200, Clement Courbet wrote:
> > Thanks for the comments Yury.
> > 
> > > But I'd like also to keep _find_next_bit() consistent with
> > > _find_next_bit_le()
> > 
> > Not sure I understand what you're suggesting here: Do you want a
> > find_next_and_bit_le() or do you want to make _find_next_bit_le() more
> > like _find_next_bit() ? In the latter case we might just want to merge
> > it with _find_next_bit() and end up with an extra is_le parameter :)
> 
> Both ways will work, but I think that extra is_le is too much.
> _find_next_bit_le() should be the copy of _find_next_bit() with the
> addition of swapping code.
> 
> If you don't need find_next_and_bit_le(), don't add it.
> find_{first,last}_bit() doesn't have LE version, for example.

Few comments more.

_find_next_bit is now referenced also by find_next_and_bit(), and
so it should be added to guard code:

#if !defined(find_next_bit) || !defined(find_next_zero_bit)
|| !defined(find_next_and_bit)
static unsigned long _find_next_bit( ... )
{
...
}
#endif

It may be essential at least for arm.

Don't forget to synchronize your changes with
tools/lib/find_bit.c

Thanks,
Yury


Re: [PATCH] lib: use correct format string for find-bit tests

2017-11-14 Thread Yury Norov
On Mon, Nov 13, 2017 at 02:55:45PM +0100, Arnd Bergmann wrote:
> The cycles_t definition is architecture specific, which causes
> a link error on all architectures that use a 'long long' or 'int'
> type for it:
> 
> lib/test_find_bit.c: In function 'test_find_last_bit':
> include/linux/kern_levels.h:5:18: error: format '%ld' expects argument of 
> type 'long int', but argument 2 has type 'cycles_t {aka long long unsigned 
> int}' [-Werror=format=]
> 
> This adds an explicit cast to 'u64' for it, which lets us use
> '%llu' everywhere.
> 
> Fixes: 09588b1f1d58 ("lib: test module for find_*_bit() functions")
> Signed-off-by: Arnd Bergmann 

Hi Arnd,

patch looks OK. Thank you.

Acked-by: Yury Norov 


Re: [PATCH] lib: test module for find_*_bit() functions

2017-11-14 Thread Yury Norov
Hi Alexey, Andrew,

Thanks for comments.

On Fri, Nov 10, 2017 at 12:45:18PM +0200, Alexey Dobriyan wrote:
> On 11/10/17, Andrew Morton  wrote:
> > On Thu,  9 Nov 2017 17:07:14 +0300 Yury Norov 
> > wrote:
> >
> >> find_bit functions are widely used in the kernel, including hot paths.
> >> This module tests performance of that functions in 2 typical scenarios:
> >> randomly filled bitmap with relatively equal distribution of set and
> >> cleared bits, and sparse bitmap which has 1 set bit for 500 cleared bits.
> >>
> >> ...
> >>
> >> +config TEST_FIND_BIT
> >
> > Well.  It doesn't actually "test" the code.  It measures its performance ;)
> 
> Yes!
> 
> Yyra, you can grab CONFIG_BENCHMARK_* namespace :-)
 
There's no CONFIG_BENCHMARK_* namespace actually. The 'CONFIG_*_BENCHMARK' is
referenced only 3 times in linux sources - CONFIG_RING_BUFFER_BENCHMARK,
CONFIG_TRACEPOINT_BENCHMARK and CONFIG_GUP_BENCHMARK, so I simply didn't know
about it. Some other tests like lib/rbtree_test.c also measure performance and
use TEST namespace, but if you think it's better, I don't object to change it.
 
> Another thing:
> 
> > +
> > +   return 0;
> > +}
> > +module_init(find_bit_test);
> > +
> > +static void __exit test_find_bit_cleanup(void)
> > +{
> > +}
> > +module_exit(test_find_bit_cleanup);
> 
> module exit hook is entirely unnecessary as you can return -E from init hook.
> See lib/test-kstrtox.c

Ack. 

I thought about sending a v3, but the patch is already in the -next tree, so I'll
send the fix in a separate patch. OK?

Yury


Re: [PATCH] lib: test module for find_*_bit() functions

2017-11-14 Thread Yury Norov
Hi Michael,

On Sun, Nov 12, 2017 at 10:33:55PM +1100, Michael Ellerman wrote:
> Yury Norov  writes:
> 
> > find_bit functions are widely used in the kernel, including hot paths.
> > This module tests performance of that functions in 2 typical scenarios:
> > randomly filled bitmap with relatively equal distribution of set and
> > cleared bits, and sparse bitmap which has 1 set bit for 500 cleared bits.
> >
> > On ThunderX machine:
> >
> >  Start testing find_bit() with random-filled bitmap
> > [1032111.632383] find_next_bit: 240043 cycles,  164062 
> > iterations
> > [1032111.647236] find_next_zero_bit:312848 cycles,  163619 
> > iterations
> > [1032111.661585] find_last_bit: 193748 cycles,  164062 
> > iterations
> > [1032113.450517] find_first_bit:177720874 cycles,   164062 
> > iterations
> > [1032113.462930]
> >  Start testing find_bit() with sparse bitmap
> > [1032113.477229] find_next_bit: 3633 cycles,656 iterations
> > [1032113.494281] find_next_zero_bit:620399 cycles,  327025 
> > iterations
> > [1032113.506723] find_last_bit: 3038 cycles,656 iterations
> > [1032113.524485] find_first_bit:691407 cycles,  656 iterations
> 
> Have you thought about timing it rather than using get_cycles()?
> 
> get_cycles() has the downside that it can't be compared across different
> architectures or even platforms within an architecture.

This test is written to benchmark find_bit() on the same target if algorithm
is changed. Comparing different architectures looks problematic anyway.

Different architectures may have different clock rates, and even implementations
of the function, like ARM. And many CPUs support dynamic changing of CPU speed
which will directly affect time of execution. So I don't think that direct
comparison of time across platforms would be informative without additional
precautions.

Also, other tests, like lib/interval_tree_test.c or lib/rbtree_test.c print
get_cycles() where they need to estimate performance, and it looks like common
practice.

Do you have real usecase for it?

Yury


next-20171102: ARM64 dies on boot

2017-11-02 Thread Yury Norov
  ldpx21, x22, [sp, #32]  
  │
   │0x08dd8cec  ldpx23, x24, [sp, #48]  
  │
   │0x08dd8cf0  ldpx29, x30, [sp], #80  
  │
   │0x08dd8cf4  ret 
  |

This is very early stage, so there's no messages in console.
Config is attached. If no ideas, I can bisect it later.

Yury


config.next.gz
Description: application/gzip


Re: [PATCH 08/15] arm64: don't pass -maarch64linux to GNU gold

2017-11-03 Thread Yury Norov
Hi Sami,

Very interesting reading, thank you.

On Fri, Nov 03, 2017 at 10:11:53AM -0700, Sami Tolvanen wrote:
> This change fixes the following error message when linking with GNU
> gold:
> 
>   ld.gold: error: unrecognized emulation aarch64linux
> 
> Signed-off-by: Sami Tolvanen 
> ---
>  arch/arm64/Makefile | 4 
>  1 file changed, 4 insertions(+)
> 
> diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
> index eb6f3c9ec6cb..c16bd1ab37f8 100644
> --- a/arch/arm64/Makefile
> +++ b/arch/arm64/Makefile
> @@ -64,14 +64,18 @@ KBUILD_CPPFLAGS   += -mbig-endian
>  CHECKFLAGS   += -D__AARCH64EB__
>  AS   += -EB
>  LD   += -EB
> +ifneq ($(ld-name),gold)
>  LDFLAGS  += -maarch64linuxb
> +endif
>  UTS_MACHINE  := aarch64_be
>  else
>  KBUILD_CPPFLAGS  += -mlittle-endian
>  CHECKFLAGS   += -D__AARCH64EL__
>  AS   += -EL
>  LD   += -EL
> +ifneq ($(ld-name),gold)
>  LDFLAGS  += -maarch64linux
> +endif
>  UTS_MACHINE  := aarch64
>  endif

-maarch64linux was added to LDFLAGS, and -mabi=lp64 was added to CFLAGS
in the same patch to ensure that the kernel will be compiled and linked in
lp64 mode, even if the toolchain by default compiles and links targets in
ilp32 mode. So I think that simply removing this flag looks inaccurate.

Also, IIUC, this patch is not related to LTO, because if ld.gold doesn't
recognize -maarch64linux with LTO, it will not recognize it in non-LTO
build. Am I right?

I think that more correct way to do it would be either:
 - add maarch64linux to ld.gold, if possible. In discussion to other
   patches people talk that they need very fresh clang to build kernel
   properly, so this may be considered as similar issue;
 - if ld.gold understands some synonyms like -mabi=lp64, it should be
   passed to LDFLAGS here;
 - if ld.gold can link only lp64 objects, it should be commented here.
   But in this case I don't understand for example how to build vdso
   for ilp32 userspace...

Thanks,
Yury


[PATCH] lib: hint GCC to inline _find_next_bit() helper

2017-10-28 Thread Yury Norov
Hi all,

It seems that inlining the _find_next_bit() helper makes
find_next_bit() and find_next_zero_bit() 2 times faster at
the scenario of finding all set/cleared bits of randomly
initialised bitmap.

For another typical scenario of traversing sparse bitmap
there is also measurable improvement observed, about 15%.

The increasing of text size of find_bit.o module is 40 bytes
for arm64 - from 252 to 292 bytes - is looking acceptable.

This patch also contains test module.

Measured on ThunderX machine. Tests for other architectures are
very appreciated.

Before:
[   96.856195] Start testing find_bit() with random-filled bitmap
[   96.868322] find_next_bit: 34529 cycles, 16304 iterations
[   96.879525] find_next_zero_bit: 35771 cycles, 16465 iterations
[   96.891409] find_last_bit: 17444 cycles, 16304 iterations
[   96.914445] find_first_bit: 1219671 cycles, 16305 iterations
[   96.925802] Start testing find_bit() with sparse bitmap
[   96.936308] find_next_bit: 301 cycles, 66 iterations
[   96.946981] find_next_zero_bit: 70897 cycles, 32703 iterations
[   96.958694] find_last_bit: 286 cycles, 66 iterations
[   96.968710] find_first_bit: 5260 cycles, 66 iterations

After:
[  169.464229] Start testing find_bit() with random-filled bitmap
[  169.476191] find_next_bit: 17520 cycles, 16336 iterations
[  169.487210] find_next_zero_bit: 17622 cycles, 16433 iterations
[  169.499111] find_last_bit: 19272 cycles, 16335 iterations
[  169.519735] find_first_bit: 978657 cycles, 16337 iterations
[  169.530912] Start testing find_bit() with sparse bitmap
[  169.541414] find_next_bit: 252 cycles, 66 iterations
[  169.551726] find_next_zero_bit: 34554 cycles, 32703 iterations
[  169.563436] find_last_bit: 294 cycles, 66 iterations
[  169.573439] find_first_bit: 3964 cycles, 66 iterations

CC: Alexey Dobriyan 
CC: Andrew Morton 
CC: Clement Courbet 
CC: Matthew Wilcox 
CC: Rasmus Villemoes 
Signed-off-by: Yury Norov 
---
 lib/Kconfig.debug|   9 
 lib/Makefile |   1 +
 lib/find_bit.c   |   2 +-
 lib/test_find_bit.c  | 141 +++
 tools/lib/find_bit.c |   2 +-
 5 files changed, 153 insertions(+), 2 deletions(-)
 create mode 100644 lib/test_find_bit.c

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index dfdad67d8f6c..138034cc68a3 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1838,6 +1838,15 @@ config TEST_BPF
 
  If unsure, say N.
 
+config TEST_FIND_BIT
+   tristate "Test find_bit functions"
+   default n
+   help
+ This builds the "test_find_bit" module that measure find_*_bit()
+ functions performance.
+
+ If unsure, say N.
+
 config TEST_FIRMWARE
tristate "Test firmware loading via userspace interface"
default n
diff --git a/lib/Makefile b/lib/Makefile
index dafa79613fb4..edb792b42c86 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -45,6 +45,7 @@ obj-y += hexdump.o
 obj-$(CONFIG_TEST_HEXDUMP) += test_hexdump.o
 obj-y += kstrtox.o
 obj-$(CONFIG_TEST_BPF) += test_bpf.o
+obj-$(CONFIG_TEST_FIND_BIT) += test_find_bit.o
 obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o
 obj-$(CONFIG_TEST_SYSCTL) += test_sysctl.o
 obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o
diff --git a/lib/find_bit.c b/lib/find_bit.c
index 6ed74f78380c..9b0c89f3fd3a 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -28,7 +28,7 @@
  * find_next_zero_bit.  The difference is the "invert" argument, which
  * is XORed with each fetched word before searching it for one bits.
  */
-static unsigned long _find_next_bit(const unsigned long *addr,
+static inline unsigned long _find_next_bit(const unsigned long *addr,
unsigned long nbits, unsigned long start, unsigned long invert)
 {
unsigned long tmp;
diff --git a/lib/test_find_bit.c b/lib/test_find_bit.c
new file mode 100644
index ..8eaf10cae214
--- /dev/null
+++ b/lib/test_find_bit.c
@@ -0,0 +1,141 @@
+/*
+ * Test for find_*_bit functions.
+ *
+ * Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+/*
+ * find_bit functions are widely used in kernel, so the successful boot
+ * is good enough test for correctness.
+ *
+ * This test is focused on performance of traversing bitmaps. Two typical
+ * scenarios are reproduced:
+ * - randomly filled bitmap with approximately equal number of set and
+ *   cleared bits;
+ * - sparse bitmap with few set bits at random positions.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#includ

Re: [PATCH v3] lib: optimize cpumask_next_and()

2017-10-28 Thread Yury Norov
On Thu, Oct 26, 2017 at 02:58:00PM +0200, Alexey Dobriyan wrote:
> >  - Refactored _find_next_common_bit into _find_next_bit., as suggested
> >by Yury Norov. This has no adverse effects on the performance side,
> >as the compiler successfully inlines the code.
> 
> 1)
> Gentoo ships 5.4.0 which doesn't inline this code on x86_64 defconfig
> (which has OPTIMIZE_INLINING).
> 
> 
> 813556c0 :
> 813556c0:   55  push   rbp
> 813556c1:   48 89 d1movrcx,rdx
> 813556c4:   45 31 c0xorr8d,r8d
> 813556c7:   48 89 f2movrdx,rsi
> 813556ca:   31 f6   xoresi,esi
> 813556cc:   48 89 e5movrbp,rsp
> 813556cf:   e8 7c ff ff ff  call
> 81355650 <_find_next_bit>
> 813556d4:   5d  poprbp
> 813556d5:   c3  ret

GCC 7 for ARM64 doesn't inline as well. I wrote test for it to measure
the effect of inlining:
http://www.spinics.net/lists/kernel/msg2635338.html

The performance impact of this patch without inlining: 

Before:
[   96.856195] Start testing find_bit() with random-filled bitmap
[   96.868322] find_next_bit: 34529 cycles, 16304 iterations
[   96.879525] find_next_zero_bit: 35771 cycles, 16465 iterations
[   96.891409] find_last_bit: 17444 cycles, 16304 iterations
[   96.914445] find_first_bit: 1219671 cycles, 16305 iterations
[   96.925802] Start testing find_bit() with sparse bitmap
[   96.936308] find_next_bit: 301 cycles, 66 iterations
[   96.946981] find_next_zero_bit: 70897 cycles, 32703 iterations
[   96.958694] find_last_bit: 286 cycles, 66 iterations
[   96.968710] find_first_bit: 5260 cycles, 66 iterations

After:
[  116.205594] Start testing find_bit() with random-filled bitmap
[  116.217621] find_next_bit: 24979 cycles, 16449 iterations
[  116.228719] find_next_zero_bit: 25666 cycles, 16320 iterations
[  116.240620] find_last_bit: 19407 cycles, 16449 iterations
[  116.268368] find_first_bit: 1690945 cycles, 16449 iterations
[  116.279718] Start testing find_bit() with sparse bitmap
[  116.290219] find_next_bit: 352 cycles, 66 iterations
[  116.300692] find_next_zero_bit: 50916 cycles, 32703 iterations
[  116.312400] find_last_bit: 295 cycles, 66 iterations
[  116.322427] find_first_bit: 6742 cycles, 66 iterations

And with inlining:

Before:
[  169.464229] Start testing find_bit() with random-filled bitmap
[  169.476191] find_next_bit: 17520 cycles, 16336 iterations
[  169.487210] find_next_zero_bit: 17622 cycles, 16433 iterations
[  169.499111] find_last_bit: 19272 cycles, 16335 iterations
[  169.519735] find_first_bit: 978657 cycles, 16337 iterations
[  169.530912] Start testing find_bit() with sparse bitmap
[  169.541414] find_next_bit: 252 cycles, 66 iterations
[  169.551726] find_next_zero_bit: 34554 cycles, 32703 iterations
[  169.563436] find_last_bit: 294 cycles, 66 iterations
[  169.573439] find_first_bit: 3964 cycles, 66 iterations

After
[  191.191170] Start testing find_bit() with random-filled bitmap
[  191.203133] find_next_bit: 17530 cycles, 16346 iterations
[  191.214150] find_next_zero_bit: 17630 cycles, 16423 iterations
[  191.226037] find_last_bit: 17489 cycles, 16347 iterations
[  191.246672] find_first_bit: 979961 cycles, 16347 iterations
[  191.257849] Start testing find_bit() with sparse bitmap
[  191.268351] find_next_bit: 257 cycles, 66 iterations
[  191.278660] find_next_zero_bit: 34547 cycles, 32703 iterations
[  191.290370] find_last_bit: 292 cycles, 66 iterations
[  191.300376] find_first_bit: 4269 cycles, 66 iterations

I didn't investigate why non-inlined version of this patch works
faster than vanilla code, but inlined one is even faster and is
as fast as inlined version of existing code. I think, we should
come with it finally.

It would be great if someone test it on x86.
 
> 2)
> Making "and" operation to be centerpiece of this code is kind of meh
> find_next_or_bit() will be hard to implement.

Not so hard actually. :)
https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1521775.html

Yury


Re: [PATCH] Fix line too long warning

2017-10-29 Thread Yury Norov
Hi Kien,

On Sat, Oct 28, 2017 at 10:46:13PM -0400, Kien Ha wrote:
> >From fc52a98aca0c033f2c03fdc7e8f83ae49625675a Mon Sep 17 00:00:00 2001
> From: Kien Ha 
> Date: Fri, 27 Oct 2017 14:07:55 -0400
> Subject: [PATCH] Fix line too long warning
> 
> Signed-off-by: Kien Ha 
> ---
>  drivers/staging/rtlwifi/base.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/staging/rtlwifi/base.c b/drivers/staging/rtlwifi/base.c
> index b88b0e8edd3d..bbc80f976e12 100644
> --- a/drivers/staging/rtlwifi/base.c
> +++ b/drivers/staging/rtlwifi/base.c
> @@ -1283,7 +1283,8 @@ void rtl_get_tcb_desc(struct ieee80211_hw *hw,
>   } else {
>   if (rtlmac->mode == WIRELESS_MODE_B) {
>   tcb_desc->hw_rate =
> - 
> rtlpriv->cfg->maps[RTL_RC_CCK_RATE11M];
> + rtlpriv->cfg->maps[
> + RTL_RC_CCK_RATE11M];

At first, if you fix this, you should also fix similar problem 3 lines
below, right?

>   } else {
>   tcb_desc->hw_rate =
>   
> rtlpriv->cfg->maps[RTL_RC_OFDM_RATE54M];

At second, and most important, refer Documentation/process/coding-style.rst:
Now, some people will claim that having 8-character indentations makes
the code move too far to the right, and makes it hard to read on a 
80-character terminal screen.  The answer to that is that if you need 
more than 3 levels of indentation, you're screwed anyway, and should fix
your program.

The real problem here is not "line too long", but "indentation level too
big" - 5. And it worth to address real problem.

Yury


Re: [PATCH] Fix line too long warning

2017-10-29 Thread Yury Norov
On Sun, Oct 29, 2017 at 06:54:09PM +0300, Yury Norov wrote:
> Hi Kien,
> 
> On Sat, Oct 28, 2017 at 10:46:13PM -0400, Kien Ha wrote:
> > >From fc52a98aca0c033f2c03fdc7e8f83ae49625675a Mon Sep 17 00:00:00 2001
> > From: Kien Ha 
> > Date: Fri, 27 Oct 2017 14:07:55 -0400
> > Subject: [PATCH] Fix line too long warning
> > 
> > Signed-off-by: Kien Ha 
> > ---
> >  drivers/staging/rtlwifi/base.c | 3 ++-
> >  1 file changed, 2 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/staging/rtlwifi/base.c b/drivers/staging/rtlwifi/base.c
> > index b88b0e8edd3d..bbc80f976e12 100644
> > --- a/drivers/staging/rtlwifi/base.c
> > +++ b/drivers/staging/rtlwifi/base.c
> > @@ -1283,7 +1283,8 @@ void rtl_get_tcb_desc(struct ieee80211_hw *hw,
> > } else {
> > if (rtlmac->mode == WIRELESS_MODE_B) {
> > tcb_desc->hw_rate =
> > -   
> > rtlpriv->cfg->maps[RTL_RC_CCK_RATE11M];
> > +   rtlpriv->cfg->maps[
> > +   RTL_RC_CCK_RATE11M];
> 
> At first, if you fix this, you should also fix similar problem 3 lines
> below, right?
> 
> > } else {
> > tcb_desc->hw_rate =
> > 
> > rtlpriv->cfg->maps[RTL_RC_OFDM_RATE54M];
> 
> At second, and most important, refer Documentation/process/coding-style.rst:
> Now, some people will claim that having 8-character indentations makes
> the code move too far to the right, and makes it hard to read on a 
> 80-character terminal screen.  The answer to that is that if you need 
> more than 3 levels of indentation, you're screwed anyway, and should fix
> your program.
> 
> The real problem here is not "line too long", but "indentation level too
> big" - 5. And it worth to address real problem.

It's not so hard though, something like this:

void rtl_get_tcb_desc(struct ieee80211_hw *hw,
  struct ieee80211_tx_info *info,
  struct ieee80211_sta *sta,
  struct sk_buff *skb, struct rtl_tcb_desc *tcb_desc)
{
#define SET_RATE_ID(rate_id)\
((rtlpriv->cfg->spec_ver & RTL_SPEC_NEW_RATEID) ?   \
rtl_mrate_idx_to_arfr_id(hw, rate_id,   \
(sta_entry ? sta_entry->wireless_mode : \
 WIRELESS_MODE_G)) :\
rate_id)

struct rtl_priv *rtlpriv = rtl_priv(hw);
struct rtl_mac *rtlmac = rtl_mac(rtl_priv(hw));
struct ieee80211_hdr *hdr = rtl_get_hdr(skb);
struct rtl_sta_info *sta_entry =
(sta ? (struct rtl_sta_info *)sta->drv_priv : NULL);

__le16 fc = rtl_get_fc(skb);

tcb_desc->hw_rate = _rtl_get_tx_hw_rate(hw, info);

if (rtl_is_tx_report_skb(hw, skb))
tcb_desc->use_spe_rpt = 1;

if (!ieee80211_is_data(fc)) {
tcb_desc->use_driver_rate = true;
tcb_desc->ratr_index = SET_RATE_ID(RATR_INX_WIRELESS_MC);
tcb_desc->disable_ratefallback = 1;
tcb_desc->mac_id = 0;
tcb_desc->packet_bw = false;
return;
}

if (is_multicast_ether_addr(hdr->addr1))
tcb_desc->multicast = 1;
else if (is_broadcast_ether_addr(hdr->addr1))
tcb_desc->broadcast = 1;

/*
 * we set data rate INX 0 in rtl_rc.c if skb is special data or
 * mgt which need low data rate. So tcb_desc->hw_rate is just used
 * for special data and mgt frames
 */
if (info->control.rates[0].idx == 0 || ieee80211_is_nullfunc(fc)) {
tcb_desc->use_driver_rate = true;
tcb_desc->ratr_index = SET_RATE_ID(RATR_INX_WIRELESS_MC);
tcb_desc->disable_ratefallback = 1;
goto rtl_query;
}

/* because hw will never use hw_rate
 * when tcb_desc->use_driver_rate = false
 * so we never set highest N rate here,
 * and N rate will all be controlled by FW
 * when tcb_desc->use_driver_rate = false
 */
if (sta && sta->vht_cap.vht_supported) {
tcb_desc->hw_rate = _rtl_get_vht_highest_n_rate(hw, sta);
goto rtl_query;
}

if (sta && (sta->ht_cap.ht_supported)) {
tcb_desc->hw_rate = _rtl_get_highest_n

Re: [PATCH] Fix line too long warning

2017-10-29 Thread Yury Norov
On Sun, Oct 29, 2017 at 10:28:27AM -0700, Joe Perches wrote:
> On Sun, 2017-10-29 at 18:54 +0300, Yury Norov wrote:
> > At second, and most important, refer Documentation/process/coding-style.rst:
> > Now, some people will claim that having 8-character indentations makes
> > the code move too far to the right, and makes it hard to read on a 
> > 80-character terminal screen.  The answer to that is that if you need 
> > more than 3 levels of indentation, you're screwed anyway, and should fix
> > your program.
> > 
> > The real problem here is not "line too long", but "indentation level too
> > big" - 5. And it worth to address real problem.
> 
> Line length issues can be a combination of several factors:
> 
> o identifier length
> o quantity of dereferences
> o indentation depth
> o code complexity
> 
> 4 indentation depth levels are not a real issue.
> A significant percentage of lines in the kernel
> are 4 or more tab indent levels deep.
> 
> checkpatch suggests that 6 or more is the depth level
> that should cause real concern.
> 
> Here's a little breakdown of lines that start with
> a tab followed by a c90 keyword in the kernel
> 
> $ git grep -P 
> "^\t+(if|for|do|while|\}|else|switch|return|case|break|continue|goto)\b" -- 
> "*.[ch]" | \
>   cut -f2- -d":" | perl -p -e 's/(^\t+).*/\1/' | \
>   sort | uniq -c | sort -rn | \
>   awk '{total += $1; count[i++] = $1} END { for (j = 0; j < i; j++) { printf 
> "%d\t%d\t%.2f%%\n", j + 1, count[j], count[j] / total * 100 }}'
> 1 1325462 52.19%
> 2 863007  33.98%
> 3 271844  10.70%
> 4 64009   2.52%
> 5 12502   0.49%
> 6 21990.09%
> 7 501 0.02%
> 8 166 0.01%
> 9 51  0.00%
> 1020  0.00%
> 1110  0.00%
> 124   0.00%
> 131   0.00%
> 
> I think it could reasonably be argued that the
> indentation depth warning (DEEP_INDENTATION)
> should start at 5 and not at 6.
> 
> ---
>  scripts/checkpatch.pl | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
> index 6bdd43d5dec5..923e4ff09d24 100755
> --- a/scripts/checkpatch.pl
> +++ b/scripts/checkpatch.pl
> @@ -3353,7 +3353,7 @@ sub process {
>  
>   my ($level, @ctx) = ctx_statement_level($linenr, 
> $realcnt, 0);
>  
> - if ($line =~ /^\+\t{6,}/) {
> + if ($line =~ /^\+\t{5,}/) {
>   WARN("DEEP_INDENTATION",
>"Too many leading tabs - consider code 
> refactoring\n" . $herecurr);
>   }

There are 2 different subjects here - this specific function in staging
driver and general considerations.

Regarding the function, it has very simple structure, and so deep indentation
level is definitely a symptom of bad design.

Regarding general considerations, the kernel is very complex, and I admit
that indentations deeper than 3 are unavoidable in some cases.

But we have coding style that insists on 3, and this rule is (probably ?)
written by Linus, so... :-)

Nevertheless, your patch is the step toward right direction, so if you
need my ack,
Acked-by: Yury Norov 


Re: [PATCH v5 00/23] ILP32 for ARM64

2015-10-05 Thread Yury Norov
On Fri, Oct 02, 2015 at 12:49:46AM +0300, Pinski, Andrew wrote:

[...]

> Ok, we will rewrite these patches using 32bit time_t and 32bit off_t and
> redo the toolchain support for them.  Note this is going back to the abi
> I had originally done when I submitted my original version when it was
> asked to change time_t to be 64bit. 
> 
> Thanks,
> Andrew

Hi Andrew,

I try to apply your glibc ILP32 patchset on current glibc master, but
there're many merge failures.
https://sourceware.org/ml/libc-alpha/2014-10/msg00596.html

Could you share more fresh version of it, if you have one?

Are there any special commands or configure options needed 
to enable ILP32 properly?

BR,
Yury.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v5 10/23] arm64: introduce is_a32_task and is_a32_thread (for AArch32 compat)

2015-10-07 Thread Yury Norov
On Tue, Oct 06, 2015 at 12:21:33PM +0300, Andrey Konovalov wrote:
> Hi Yury,
> 
> With this patch set 4.3 kernel fails to build with 
> arch/arm64/configs/defconfig plus CONFIG_AARCH32_EL0=y
> and CONFIG_ARM64_ILP32=y giving the following three errors (the 3d one is 
> warning actually):

[...]

> Thanks,
> Andrey

Hi, Andrey,

Thank you. If you're interested in early testing, you can pull
my working branch here:

https://github.com/norov/linux/tree/ilp

There, build for all combinations of CONFIG_AARCH32_EL0 and
CONFIG_ARM64_ILP32 is fixed.

I can also share my testing tools, if needed.

BR,
Yury.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 0/2] arm64: introduce run-time detection of aarch32 support

2015-09-01 Thread Yury Norov
This is needed to avoid loading aarch32 binaries if COMPAT is enabled, but
aarch32 is not supported by specific platform.

First patch fixes hidden header dependencies in 
arch/arm64/include/asm/cpufeature.h.
It's not related to the issue, but helps to avoid build failure that happens
if one applies second patch only.

Second patch adds run-time detection of aarch32 support, and rejects kernel to
load such binaries, if not supported.

Tested on ThunderX.

Signed-off-by: Yury Norov 

Yury Norov (2):
  arm64: cpufeature.h: resolve hidden header dependencies
  arm64: don't load 32-bit binaries if platform has no aarch32_el0

 arch/arm64/include/asm/cpufeature.h | 6 ++
 arch/arm64/include/asm/elf.h| 6 --
 arch/arm64/kernel/cpuinfo.c | 9 +
 3 files changed, 19 insertions(+), 2 deletions(-)

-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/2] arm64: cpufeature.h: resolve hidden header dependencies

2015-09-01 Thread Yury Norov
Functions implemented in cpufeature.h depend on some headers, but
cpufeature.h does not include them. This may cause build failure if
cpufeature.h user does not include that headers by itself. (Like it
happens in next patch of this series.)

Signed-off-by: Yury Norov 
---
 arch/arm64/include/asm/cpufeature.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/arm64/include/asm/cpufeature.h 
b/arch/arm64/include/asm/cpufeature.h
index c104421..20cdc26 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -30,6 +30,11 @@
 
 #ifndef __ASSEMBLY__
 
+#include 
+#include 
+#include 
+#include 
+
 struct arm64_cpu_capabilities {
const char *desc;
u16 capability;
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/2] arm64: don't load 32-bit binaries if platform has no aarch32_el0

2015-09-01 Thread Yury Norov
Kernel option COMPAT defines the ability of executing aarch32 binaries.
Some platforms does not support aarch32 mode, and so cannot execute that
binaries. But we cannot just disable COMPAT for them because the same
kernel binary may be used by multiple platforms.

In this patch, system_supports_aarch32_el0() is introduced to detect
aarch32 support at run-time.

Signed-off-by: Yury Norov 
---
 arch/arm64/include/asm/cpufeature.h | 1 +
 arch/arm64/include/asm/elf.h| 6 --
 arch/arm64/kernel/cpuinfo.c | 9 +
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/cpufeature.h 
b/arch/arm64/include/asm/cpufeature.h
index 20cdc26..d24ea15 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -81,6 +81,7 @@ void check_local_cpu_errata(void);
 void check_local_cpu_features(void);
 bool cpu_supports_mixed_endian_el0(void);
 bool system_supports_mixed_endian_el0(void);
+bool system_supports_aarch32_el0(void);
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h
index faad6df..461897b 100644
--- a/arch/arm64/include/asm/elf.h
+++ b/arch/arm64/include/asm/elf.h
@@ -21,6 +21,7 @@
 /*
  * ELF register definitions..
  */
+#include 
 #include 
 #include 
 
@@ -173,8 +174,9 @@ typedef compat_elf_greg_t   
compat_elf_gregset_t[COMPAT_ELF_NGREG];
 
 /* AArch32 EABI. */
 #define EF_ARM_EABI_MASK   0xff00
-#define compat_elf_check_arch(x)   (((x)->e_machine == EM_ARM) && \
-((x)->e_flags & EF_ARM_EABI_MASK))
+#define compat_elf_check_arch(x)   (system_supports_aarch32_el0()  \
+   && ((x)->e_machine == EM_ARM)   \
+   && ((x)->e_flags & EF_ARM_EABI_MASK))
 
 #define compat_start_threadcompat_start_thread
 #define COMPAT_SET_PERSONALITY(ex) set_thread_flag(TIF_32BIT);
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index 75d5a86..95d953f 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -79,6 +79,15 @@ bool system_supports_mixed_endian_el0(void)
return mixed_endian_el0;
 }
 
+#define AARCH641
+#define AARCH32_64 2
+bool system_supports_aarch32_el0(void)
+{
+   struct cpuinfo_arm64 *info = this_cpu_ptr(&cpu_data);
+   u64 arm64_el0 = info->reg_id_aa64pfr0 & 0xf;
+   return arm64_el0 == AARCH32_64;
+}
+
 static void update_mixed_endian_el0_support(struct cpuinfo_arm64 *info)
 {
mixed_endian_el0 &= 
id_aa64mmfr0_mixed_endian_el0(info->reg_id_aa64mmfr0);
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 0/2] arm64: introduce run-time detection of aarch32 support

2015-09-02 Thread Yury Norov
This is needed to avoid loading aarch32 binaries if COMPAT is enabled, but
aarch32 is not supported by specific platform.

First patch fixes hidden header dependencies in 
arch/arm64/include/asm/cpufeature.h.
It's not related to the issue, but helps to avoid build failure that happens
if one applies second patch only.

Second patch adds run-time detection of aarch32 support, and rejects kernel to
load such binaries, if not supported.

Tested on ThunderX.

V2:
 - add missing  for the __attribute_const__ on
   cpuid_feature_extract_field;
 - move cpu_feature macro under the __ASSEMBLY__ guard.
 - check that all CPUs support AArch32, not the current only,
   the same way as for endianness support.

Signed-off-by: Yury Norov 

Yury Norov (2):
  arm64: cpufeature.h: resolve hidden header dependencies
  arm64: don't load 32-bit binaries if platform has no aarch32_el0

 arch/arm64/include/asm/cpufeature.h | 10 +-
 arch/arm64/include/asm/cputype.h|  8 
 arch/arm64/include/asm/elf.h|  6 --
 arch/arm64/kernel/cpuinfo.c | 12 
 4 files changed, 33 insertions(+), 3 deletions(-)

-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 2/2] arm64: don't load 32-bit binaries if platform has no aarch32_el0

2015-09-02 Thread Yury Norov
Kernel option COMPAT defines the ability of executing aarch32 binaries.
Some platforms does not support aarch32 mode, and so cannot execute that
binaries. But we cannot just disable COMPAT for them because the same
kernel binary may be used by multiple platforms.

In this patch, system_supports_aarch32_el0() is introduced to detect
aarch32 support at run-time.

Signed-off-by: Yury Norov 
---
 arch/arm64/include/asm/cpufeature.h |  1 +
 arch/arm64/include/asm/cputype.h|  8 
 arch/arm64/include/asm/elf.h|  6 --
 arch/arm64/kernel/cpuinfo.c | 12 
 4 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/cpufeature.h 
b/arch/arm64/include/asm/cpufeature.h
index f0e4017..35f2654 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -83,6 +83,7 @@ void check_local_cpu_errata(void);
 void check_local_cpu_features(void);
 bool cpu_supports_mixed_endian_el0(void);
 bool system_supports_mixed_endian_el0(void);
+bool system_supports_aarch32_el0(void);
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
index a84ec60..a24e42c 100644
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -81,6 +81,9 @@
 #define ID_AA64MMFR0_BIGEND(mmfr0) \
(((mmfr0) & ID_AA64MMFR0_BIGEND_MASK) >> ID_AA64MMFR0_BIGEND_SHIFT)
 
+#define ID_AA64PFR0_EL0_64 1
+#define ID_AA64PFR0_EL0_6432   2
+
 #define SCTLR_EL1_CP15BEN  (0x1 << 5)
 #define SCTLR_EL1_SED  (0x1 << 8)
 
@@ -116,6 +119,11 @@ static inline u32 __attribute_const__ 
read_cpuid_cachetype(void)
return read_cpuid(CTR_EL0);
 }
 
+static inline bool id_aa64pfr0_aarch32_el0(u64 pfr0)
+{
+   return pfr0 == ID_AA64PFR0_EL0_6432;
+}
+
 static inline bool id_aa64mmfr0_mixed_endian_el0(u64 mmfr0)
 {
return (ID_AA64MMFR0_BIGEND(mmfr0) == 0x1) ||
diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h
index faad6df..461897b 100644
--- a/arch/arm64/include/asm/elf.h
+++ b/arch/arm64/include/asm/elf.h
@@ -21,6 +21,7 @@
 /*
  * ELF register definitions..
  */
+#include 
 #include 
 #include 
 
@@ -173,8 +174,9 @@ typedef compat_elf_greg_t   
compat_elf_gregset_t[COMPAT_ELF_NGREG];
 
 /* AArch32 EABI. */
 #define EF_ARM_EABI_MASK   0xff00
-#define compat_elf_check_arch(x)   (((x)->e_machine == EM_ARM) && \
-((x)->e_flags & EF_ARM_EABI_MASK))
+#define compat_elf_check_arch(x)   (system_supports_aarch32_el0()  \
+   && ((x)->e_machine == EM_ARM)   \
+   && ((x)->e_flags & EF_ARM_EABI_MASK))
 
 #define compat_start_threadcompat_start_thread
 #define COMPAT_SET_PERSONALITY(ex) set_thread_flag(TIF_32BIT);
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index 75d5a86..4a6ae31 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -36,6 +36,7 @@
 DEFINE_PER_CPU(struct cpuinfo_arm64, cpu_data);
 static struct cpuinfo_arm64 boot_cpu_data;
 static bool mixed_endian_el0 = true;
+static bool aarch32_el0 = true;
 
 static char *icache_policy_str[] = {
[ICACHE_POLICY_RESERVED] = "RESERVED/UNKNOWN",
@@ -79,13 +80,24 @@ bool system_supports_mixed_endian_el0(void)
return mixed_endian_el0;
 }
 
+bool system_supports_aarch32_el0(void)
+{
+   return aarch32_el0;
+}
+
 static void update_mixed_endian_el0_support(struct cpuinfo_arm64 *info)
 {
mixed_endian_el0 &= 
id_aa64mmfr0_mixed_endian_el0(info->reg_id_aa64mmfr0);
 }
 
+static void update_aarch32_el0_support(struct cpuinfo_arm64 *info)
+{
+   aarch32_el0 &= id_aa64pfr0_aarch32_el0(info->reg_id_aa64pfr0);
+}
+
 static void update_cpu_features(struct cpuinfo_arm64 *info)
 {
+   update_aarch32_el0_support(info);
update_mixed_endian_el0_support(info);
 }
 
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 1/2] arm64: cpufeature.h: resolve hidden header dependencies

2015-09-02 Thread Yury Norov
Functions implemented in cpufeature.h depend on some headers, but
cpufeature.h does not include them. This may cause a build failure if a
cpufeature.h user does not include those headers itself. (As happens
in the next patch of this series.)

Signed-off-by: Yury Norov 
---
 arch/arm64/include/asm/cpufeature.h | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/cpufeature.h 
b/arch/arm64/include/asm/cpufeature.h
index c104421..f0e4017 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -19,7 +19,6 @@
  */
 
 #define MAX_CPU_FEATURES   (8 * sizeof(elf_hwcap))
-#define cpu_feature(x) ilog2(HWCAP_ ## x)
 
 #define ARM64_WORKAROUND_CLEAN_CACHE   0
 #define ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE   1
@@ -30,6 +29,14 @@
 
 #ifndef __ASSEMBLY__
 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define cpu_feature(x) ilog2(HWCAP_ ## x)
+
 struct arm64_cpu_capabilities {
const char *desc;
u16 capability;
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 2/2] arm64: don't load 32-bit binaries if platform has no aarch32_el0

2015-09-02 Thread Yury Norov
On Wed, Sep 02, 2015 at 04:15:52PM +0200, Ard Biesheuvel wrote:
> On 2 September 2015 at 16:00, Yury Norov  wrote:
> > Kernel option COMPAT defines the ability of executing aarch32 binaries.
> > Some platforms does not support aarch32 mode, and so cannot execute that
> > binaries. But we cannot just disable COMPAT for them because the same
> > kernel binary may be used by multiple platforms.
> >
> > In this patch, system_supports_aarch32_el0() is introduced to detect
> > aarch32 support at run-time.
> >
> > Signed-off-by: Yury Norov 
> > ---
> >  arch/arm64/include/asm/cpufeature.h |  1 +
> >  arch/arm64/include/asm/cputype.h|  8 
> >  arch/arm64/include/asm/elf.h|  6 --
> >  arch/arm64/kernel/cpuinfo.c | 12 
> >  4 files changed, 25 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/arm64/include/asm/cpufeature.h 
> > b/arch/arm64/include/asm/cpufeature.h
> > index f0e4017..35f2654 100644
> > --- a/arch/arm64/include/asm/cpufeature.h
> > +++ b/arch/arm64/include/asm/cpufeature.h
> > @@ -83,6 +83,7 @@ void check_local_cpu_errata(void);
> >  void check_local_cpu_features(void);
> >  bool cpu_supports_mixed_endian_el0(void);
> >  bool system_supports_mixed_endian_el0(void);
> > +bool system_supports_aarch32_el0(void);
> >
> >  #endif /* __ASSEMBLY__ */
> >
> > diff --git a/arch/arm64/include/asm/cputype.h 
> > b/arch/arm64/include/asm/cputype.h
> > index a84ec60..a24e42c 100644
> > --- a/arch/arm64/include/asm/cputype.h
> > +++ b/arch/arm64/include/asm/cputype.h
> > @@ -81,6 +81,9 @@
> >  #define ID_AA64MMFR0_BIGEND(mmfr0) \
> > (((mmfr0) & ID_AA64MMFR0_BIGEND_MASK) >> ID_AA64MMFR0_BIGEND_SHIFT)
> >
> > +#define ID_AA64PFR0_EL0_64 1
> > +#define ID_AA64PFR0_EL0_6432   2
> > +
> >  #define SCTLR_EL1_CP15BEN  (0x1 << 5)
> >  #define SCTLR_EL1_SED  (0x1 << 8)
> >
> > @@ -116,6 +119,11 @@ static inline u32 __attribute_const__ 
> > read_cpuid_cachetype(void)
> > return read_cpuid(CTR_EL0);
> >  }
> >
> > +static inline bool id_aa64pfr0_aarch32_el0(u64 pfr0)
> > +{
> > +   return pfr0 == ID_AA64PFR0_EL0_6432;
> > +}
> > +
> 
> Shouldn't you be masking some bits in pfr0 before doing the comparison?
> ID_AA64PFR0_EL1.EL0 occupies bits [3:0] only.
> 

Yes, it was in v1 but I lost in v2.

> >  static inline bool id_aa64mmfr0_mixed_endian_el0(u64 mmfr0)
> >  {
> > return (ID_AA64MMFR0_BIGEND(mmfr0) == 0x1) ||
> > diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h
> > index faad6df..461897b 100644
> > --- a/arch/arm64/include/asm/elf.h
> > +++ b/arch/arm64/include/asm/elf.h
> > @@ -21,6 +21,7 @@
> >  /*
> >   * ELF register definitions..
> >   */
> > +#include 
> >  #include 
> >  #include 
> >
> > @@ -173,8 +174,9 @@ typedef compat_elf_greg_t   
> > compat_elf_gregset_t[COMPAT_ELF_NGREG];
> >
> >  /* AArch32 EABI. */
> >  #define EF_ARM_EABI_MASK   0xff00
> > -#define compat_elf_check_arch(x)   (((x)->e_machine == EM_ARM) && \
> > -((x)->e_flags & EF_ARM_EABI_MASK))
> > +#define compat_elf_check_arch(x)   (system_supports_aarch32_el0()  \
> > +   && ((x)->e_machine == EM_ARM)   \
> > +   && ((x)->e_flags & 
> > EF_ARM_EABI_MASK))
> >
> >  #define compat_start_threadcompat_start_thread
> >  #define COMPAT_SET_PERSONALITY(ex) set_thread_flag(TIF_32BIT);
> > diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
> > index 75d5a86..4a6ae31 100644
> > --- a/arch/arm64/kernel/cpuinfo.c
> > +++ b/arch/arm64/kernel/cpuinfo.c
> > @@ -36,6 +36,7 @@
> >  DEFINE_PER_CPU(struct cpuinfo_arm64, cpu_data);
> >  static struct cpuinfo_arm64 boot_cpu_data;
> >  static bool mixed_endian_el0 = true;
> > +static bool aarch32_el0 = true;
> >
> >  static char *icache_policy_str[] = {
> > [ICACHE_POLICY_RESERVED] = "RESERVED/UNKNOWN",
> > @@ -79,13 +80,24 @@ bool system_supports_mixed_endian_el0(void)
> > return mixed_endian_el0;
> >  }
> >
> > +bool system_supports_aarch32_el0(void)
> > +{
> > +   return aarch32_el0;
> > +}
> > +
> >  static void update_mixed_endian_el0_support(struct cpuinfo_arm64 *info)
> >  {
> > mixed_endian_el0 &= 
> > id_aa64mmfr0_mixed_endian_el0(info->reg_id_aa64mmfr0);
> >  }
> >
> > +static void update_aarch32_el0_support(struct cpuinfo_arm64 *info)
> > +{
> > +   aarch32_el0 &= id_aa64pfr0_aarch32_el0(info->reg_id_aa64pfr0);
> > +}
> > +
> >  static void update_cpu_features(struct cpuinfo_arm64 *info)
> >  {
> > +   update_aarch32_el0_support(info);
> > update_mixed_endian_el0_support(info);
> >  }
> >
> > --
> > 2.1.4
> >
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 1/2] arm64: cpufeature.h: resolve hidden header dependencies

2015-09-02 Thread Yury Norov
Functions implemented in cpufeature.h depend on some headers, but
cpufeature.h does not include them. This may cause a build failure if a
cpufeature.h user does not include those headers itself. (As happens
in the next patch of this series.)

cpu_feature macro is moved under "#ifndef __ASSEMBLY__" guard as
it depends on , and can be used in C files only.

Signed-off-by: Yury Norov 
Reviewed-by: Mark Rutland 
---
 arch/arm64/include/asm/cpufeature.h | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/cpufeature.h 
b/arch/arm64/include/asm/cpufeature.h
index c104421..f0e4017 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -19,7 +19,6 @@
  */
 
 #define MAX_CPU_FEATURES   (8 * sizeof(elf_hwcap))
-#define cpu_feature(x) ilog2(HWCAP_ ## x)
 
 #define ARM64_WORKAROUND_CLEAN_CACHE   0
 #define ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE   1
@@ -30,6 +29,14 @@
 
 #ifndef __ASSEMBLY__
 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define cpu_feature(x) ilog2(HWCAP_ ## x)
+
 struct arm64_cpu_capabilities {
const char *desc;
u16 capability;
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 0/2] arm64: introduce run-time detection of aarch32 support

2015-09-02 Thread Yury Norov
This is needed to avoid loading aarch32 binaries if COMPAT is enabled, but
aarch32 is not supported by specific platform.

First patch fixes hidden header dependencies in 
arch/arm64/include/asm/cpufeature.h.
It's not related to the issue, but helps to avoid build failure that happens
if one applies second patch only.

The second patch adds run-time detection of aarch32 support, and makes the
kernel refuse to load such binaries if aarch32 is not supported.

Tested on ThunderX.

V3:
 - first patch message extended with explanation for cpu_feature macro move;
 - in second patch, restored pfr0 masking in id_aa64pfr0_aarch32_el0().

V2:
 - add missing  for the __attribute_const__ on
   cpuid_feature_extract_field;
 - move cpu_feature macro under the __ASSEMBLY__ guard.
 - check that all CPUs support AArch32, not the current only,
   the same way as for endianness support.

Signed-off-by: Yury Norov 

Yury Norov (2):
  arm64: cpufeature.h: resolve hidden header dependencies
  arm64: don't load 32-bit binaries if platform has no aarch32_el0

 arch/arm64/include/asm/cpufeature.h | 10 +-
 arch/arm64/include/asm/cputype.h|  8 
 arch/arm64/include/asm/elf.h|  6 --
 arch/arm64/kernel/cpuinfo.c | 12 
 4 files changed, 33 insertions(+), 3 deletions(-)

-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 2/2] arm64: don't load 32-bit binaries if platform has no aarch32_el0

2015-09-02 Thread Yury Norov
Kernel option COMPAT defines the ability of executing aarch32 binaries.
Some platforms do not support aarch32 mode, and so cannot execute such
binaries. But we cannot just disable COMPAT for them because the same
kernel binary may be used by multiple platforms.

In this patch, system_supports_aarch32_el0() is introduced to detect
aarch32 support at run-time.

Signed-off-by: Yury Norov 
---
 arch/arm64/include/asm/cpufeature.h |  1 +
 arch/arm64/include/asm/cputype.h|  9 +
 arch/arm64/include/asm/elf.h|  6 --
 arch/arm64/kernel/cpuinfo.c | 12 
 4 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/cpufeature.h 
b/arch/arm64/include/asm/cpufeature.h
index f0e4017..35f2654 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -83,6 +83,7 @@ void check_local_cpu_errata(void);
 void check_local_cpu_features(void);
 bool cpu_supports_mixed_endian_el0(void);
 bool system_supports_mixed_endian_el0(void);
+bool system_supports_aarch32_el0(void);
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
index a84ec60..5fb5585 100644
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -81,6 +81,10 @@
 #define ID_AA64MMFR0_BIGEND(mmfr0) \
(((mmfr0) & ID_AA64MMFR0_BIGEND_MASK) >> ID_AA64MMFR0_BIGEND_SHIFT)
 
+#define ID_AA64PFR0_EL0_64 1
+#define ID_AA64PFR0_EL0_6432   2
+#define ID_AA64PFR0_EL0_MASK   0xf
+
 #define SCTLR_EL1_CP15BEN  (0x1 << 5)
 #define SCTLR_EL1_SED  (0x1 << 8)
 
@@ -116,6 +120,11 @@ static inline u32 __attribute_const__ 
read_cpuid_cachetype(void)
return read_cpuid(CTR_EL0);
 }
 
+static inline bool id_aa64pfr0_aarch32_el0(u64 pfr0)
+{
+   return (pfr0 & ID_AA64PFR0_EL0_MASK) == ID_AA64PFR0_EL0_6432;
+}
+
 static inline bool id_aa64mmfr0_mixed_endian_el0(u64 mmfr0)
 {
return (ID_AA64MMFR0_BIGEND(mmfr0) == 0x1) ||
diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h
index faad6df..461897b 100644
--- a/arch/arm64/include/asm/elf.h
+++ b/arch/arm64/include/asm/elf.h
@@ -21,6 +21,7 @@
 /*
  * ELF register definitions..
  */
+#include 
 #include 
 #include 
 
@@ -173,8 +174,9 @@ typedef compat_elf_greg_t   
compat_elf_gregset_t[COMPAT_ELF_NGREG];
 
 /* AArch32 EABI. */
 #define EF_ARM_EABI_MASK   0xff00
-#define compat_elf_check_arch(x)   (((x)->e_machine == EM_ARM) && \
-((x)->e_flags & EF_ARM_EABI_MASK))
+#define compat_elf_check_arch(x)   (system_supports_aarch32_el0()  \
+   && ((x)->e_machine == EM_ARM)   \
+   && ((x)->e_flags & EF_ARM_EABI_MASK))
 
 #define compat_start_threadcompat_start_thread
 #define COMPAT_SET_PERSONALITY(ex) set_thread_flag(TIF_32BIT);
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index 75d5a86..4a6ae31 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -36,6 +36,7 @@
 DEFINE_PER_CPU(struct cpuinfo_arm64, cpu_data);
 static struct cpuinfo_arm64 boot_cpu_data;
 static bool mixed_endian_el0 = true;
+static bool aarch32_el0 = true;
 
 static char *icache_policy_str[] = {
[ICACHE_POLICY_RESERVED] = "RESERVED/UNKNOWN",
@@ -79,13 +80,24 @@ bool system_supports_mixed_endian_el0(void)
return mixed_endian_el0;
 }
 
+bool system_supports_aarch32_el0(void)
+{
+   return aarch32_el0;
+}
+
 static void update_mixed_endian_el0_support(struct cpuinfo_arm64 *info)
 {
mixed_endian_el0 &= 
id_aa64mmfr0_mixed_endian_el0(info->reg_id_aa64mmfr0);
 }
 
+static void update_aarch32_el0_support(struct cpuinfo_arm64 *info)
+{
+   aarch32_el0 &= id_aa64pfr0_aarch32_el0(info->reg_id_aa64pfr0);
+}
+
 static void update_cpu_features(struct cpuinfo_arm64 *info)
 {
+   update_aarch32_el0_support(info);
update_mixed_endian_el0_support(info);
 }
 
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC PATCH] arm64: cpuinfo: reduce cache contention on update_{feature}_support

2015-09-04 Thread Yury Norov
This patch is on top of https://lkml.org/lkml/2015/9/2/413

In master, there's only a single function -
update_mixed_endian_el0_support
And similar function is on review mentioned above.

The algorithm for them is like this:
 - there's system-wide boolean marker for the feature that is
   initially enabled;
 - there's also updater for the feature that may disable it
   system-widely if feature is not supported on current CPU.
 - updater is called for each CPU on bootup.

The problem is the way updater does its work. On each CPU, it
unconditionally updates system-wide marker. For multi-core
system it makes CPU issue invalidate message for a cache
line containing marker. This invalidate increases cache
contention for nothing, because there's a single marker reset
that is really needed, and the others are useless.

If the number of system-wide markers of this sort grows,
it may become a problem on large-scale SoCs. The fix is trivial,
though: do system-wide marker update conditionally, and preserve
corresponding cache line in shared state for all update() calls,
except, probably, one.

Signed-off-by: Yury Norov 
---
 arch/arm64/kernel/cpuinfo.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index 4a6ae31..9972c1e 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -87,12 +87,14 @@ bool system_supports_aarch32_el0(void)
 
 static void update_mixed_endian_el0_support(struct cpuinfo_arm64 *info)
 {
-   mixed_endian_el0 &= 
id_aa64mmfr0_mixed_endian_el0(info->reg_id_aa64mmfr0);
+   if (mixed_endian_el0 && 
!id_aa64mmfr0_mixed_endian_el0(info->reg_id_aa64mmfr0))
+   mixed_endian_el0 = false;
 }
 
 static void update_aarch32_el0_support(struct cpuinfo_arm64 *info)
 {
-   aarch32_el0 &= id_aa64pfr0_aarch32_el0(info->reg_id_aa64pfr0);
+   if (aarch32_el0 && !id_aa64pfr0_aarch32_el0(info->reg_id_aa64pfr0))
+   aarch32_el0 = false;
 }
 
 static void update_cpu_features(struct cpuinfo_arm64 *info)
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] arm64: cpuinfo: reduce cache contention on update_{feature}_support

2015-09-04 Thread Yury Norov
On Fri, Sep 04, 2015 at 05:40:57PM +0100, Suzuki K. Poulose wrote:
> On 04/09/15 17:04, Yury Norov wrote:
> >This patch is on top of https://lkml.org/lkml/2015/9/2/413
> >
> >In master, there's only a single function -
> > update_mixed_endian_el0_support
> >And similar function is on review mentioned above.
> >
> >The algorithm for them is like this:
> >  - there's system-wide boolean marker for the feature that is
> >initially enabled;
> >  - there's also updater for the feature that may disable it
> >system-widely if feature is not supported on current CPU.
> >  - updater is called for each CPU on bootup.
> >
> >The problem is the way updater does its work. On each CPU, it
> >unconditionally updates system-wide marker. For multi-core
> >system it makes CPU issue invalidate message for a cache
> >line containing marker. This invalidate increases cache
> >contention for nothing, because there's a single marker reset
> >that is really needed, and the others are useless.
> >
> >If the number of system-wide markers of this sort will grow,
> >it may become a trouble on large-scale SOCs. The fix is trivial,
> >though: do system-wide marker update conditionally, and preserve
> >corresponding cache line in shared state for all update() calls,
> >except, probably, one.
> 
> As I have mentioned already, this patch (and the per feature functions)
> won't be needed once we merge my series (which is waiting for the merge
> window to see the public lights)
> 

OK. Than waiting for your patchset.

BR,
Yury

> Cheers
> Suzuki
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v5 00/23] ILP32 for ARM64

2015-10-01 Thread Yury Norov
On Wed, Sep 30, 2015 at 11:19:19AM +0100, Catalin Marinas wrote:
> On Wed, Sep 30, 2015 at 01:13:57AM +0300, Yury Norov wrote:
> > V5 reincarnation for ILP32.
> > 
> > This is mostly the same code as Andrew suggested in v3:
> > https://lkml.org/lkml/2014/9/3/704.
> > 
> > V4 series and discussion:
> > https://lkml.org/lkml/2015/4/13/691
> > 
> > Discussion on v3 and v4 raised questions and some disagreement in community,
> > and therefore patches are not accepted till now. In this v5 I tried to 
> > avoid any
> > changes that are not about obvious fixes, so all interface and 
> > implementation
> > questions are still here.
> 
> This thing comes roughly every 5-6 months, so I don't think it's worth
> reviewing it again and forgetting about it until sometime next year. We
> also had discussions on the v4 and IIRC we agreed that the ABI should be
> closer to AArch32/compat in terms of __kernel_size_t, time_t but with
> the canonical set of system calls from the asm-generic/unistd.h.
> 
> > In v5:
> >  - rebased on top of 4.3.0-rc3;
> >  - build fixed if ILP32 enabled without AARCH32;
> >  - PATCH v4 22/24 (use compat for stack_t) dropped because it confuses
> >debug tools like gdb and strace;
> >  - PATCH v4 20/24 (use compat-syscalls for msgsnd and msgrcv for ILP32)
> >dropped as breaking tests;
> >  - PATCH v5 22/23 (msgrcv, msgsnd handlers) introduced for proper 
> >handling of msgrcv, msgsnd;
> >  - other minor fixes.
> 
> So apart from rebasing, there are no ABI changes. I don't think it's
> worth re-discussing the points raised during v4.
> 
> > Questions under discussion:
> >  - What for ILP32 on ARM64?
> > See https://lkml.org/lkml/2015/4/13/814
> > and http://permalink.gmane.org/gmane.comp.lib.uclibc.buildroot/121100
> > Briefly,
> >  - for compatibility;
> >  - for performance;
> >  - for memory saving.
> 
> Does anyone actually need this ABI? And by "need" I don't mean a
> tick-box on product fliers but actually someone going to use it on real
> systems in the field. Because I'm not keen on maintaining an ABI in the
> kernel just as a PR exercise. I have yet to see conclusive benchmarks
> that ILP32 is a real win vs LP64 but happy to be proven wrong.
> 

Adding Prasun Capoor 

I'm not familiar with the details. I know that ARM32 compatibility is the main
concern now. I think that in the long run compatibility doesn't mean much;
performance does instead. Bamvor Jian Zhang reports a 10%
performance gain, and I think no one will miss the chance to become 10%
faster, if speed is a real concern, just by rebuilding the application.

> That said, I'm fine with agreeing on an ABI and see whether it takes off
> before any merging decisions.
> 
> >  - ABI questions: time_t and so on;
> > I think we are out of choice now. Patches to GCC and Glibc are
> > upstreamed more than a year ago, and there already might be a code 
> > compiled
> > against existing ABI. At the end, there is no major disagreement, and 
> > final
> > word is after ABI users. And I found no objections from that side.
> 
> CORRECTION: patches for gcc have been upstreamed, that's the ELF and PCS
> AArch64 ILP32 ABI. The syscall ABI which goes in glibc hasn't been
> merged because we did not reach an agreement on the kernel ABI (it would
> be rather silly to push something into mainline glibc that's not
> officially supported by Linux).
> 
> I really don't care if there is compiled code out there using out of
> tree patches for glibc and the kernel.

You right, they are out of tree. Sorry.

> 
> >  - Implementation questions: use ILP32 separated table or not, and others;
> > Code proposed by Andrew works just fine for more than a year,
> > and it even shows slightly better performance comparing to LP64:
> > http://permalink.gmane.org/gmane.comp.lib.uclibc.buildroot/121100
> > So I see no reason to change something except obvious bugs, if found.
> 
> As I said, with patches twice a year, I don't remember the past
> discussions. So normally you should start with v4 and address the
> comments there. But you seem to have refreshed v3.
> 
> Anyway, if by table you mean the syscall table, I think on v4 we agreed
> on a separate ILP32 syscall table using the generic syscall numbering
> but with some compat syscall pointers where applicable.
> 

We already have separated ILP32 syscall table, see patch 19 in this
patchset. This is in fact the option e), suggested by Arnd as best
option for him. https://lkml.org/lkml/2015/

Re: [PATCH v5 00/23] ILP32 for ARM64

2015-10-01 Thread Yury Norov
On Wed, Sep 30, 2015 at 05:41:03PM +0100, Mark Brown wrote:
> On Wed, Sep 30, 2015 at 11:19:19AM +0100, Catalin Marinas wrote:
> > On Wed, Sep 30, 2015 at 01:13:57AM +0300, Yury Norov wrote:
> 
> > >  - What for ILP32 on ARM64?
> > >   See https://lkml.org/lkml/2015/4/13/814
> > >   and http://permalink.gmane.org/gmane.comp.lib.uclibc.buildroot/121100
> > >   Briefly,
> > >- for compatibility;
> > >- for performance;
> > >- for memory saving.
> 
> > Does anyone actually need this ABI? And by "need" I don't mean a
> > tick-box on product fliers but actually someone going to use it on real
> > systems in the field. Because I'm not keen on maintaining an ABI in the
> > kernel just as a PR exercise. I have yet to see conclusive benchmarks
> > that ILP32 is a real win vs LP64 but happy to be proven wrong.
> 
> Indeed.  On that subject there was some discussion at Linaro Connect
> last week about work (being done outside Linaro, not sure how public it
> is at this point) to pull together the current state of the art into a
> Docker container image which people can use for benchmarking and as a
> reference for how to pull things together.  That should help with the
> analysis, it'll at least make it easier for other people to reproduce
> any benchmarking results.

Hi, Mark,

>From you, I got more on what happens with ILP32 than from my company.
Thank you. I know people participated Linaro Connect, and will ask
them for details. And, if possible, will share it here.

BR,
Yury.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v5 17/23] arm64:ilp32: add vdso-ilp32 and use for signal return

2015-10-01 Thread Yury Norov
On Tue, Sep 29, 2015 at 11:06:13PM -0500, Nathan Lynch wrote:
> On 09/29/2015 05:14 PM, Yury Norov wrote:
> > From: Philipp Tomsich 
> > 
> > Adjusted to move the move data page before code pages in sync with
> > commit 601255ae3c98fd3a8bb4696425e4f868b4f1
> 
> This commit message needs more information about how the ilp32 VDSO uses
> the existing arm64 code.  I had to really hunt through the Makefile to
> figure out what's going on.
> 
> The commit message should also identify the APIs that are supported.
> The subject line mentions signal return, but gettimeofday, clock_gettime
> and clock_getres are being added here too, and it is not obvious.
> 
> 
> > Signed-off-by: Philipp Tomsich 
> > Signed-off-by: Christoph Muellner 
> > Signed-off-by: Yury Norov 
> > 
> >  create mode 100644 arch/arm64/kernel/vdso-ilp32/.gitignore
> >  create mode 100644 arch/arm64/kernel/vdso-ilp32/Makefile
> >  copy arch/arm64/{include/asm/vdso.h => kernel/vdso-ilp32/vdso-ilp32.S} 
> > (56%)
> >  create mode 100644 arch/arm64/kernel/vdso-ilp32/vdso-ilp32.lds.S
> 
> How are you invoking git-format-patch?  The copy detection in this case
> is not conducive to review.
> 
> It looks like the existing arm64 vdso Makefile has been copied to
> vdso-ilp32/ and adjusted for paths and naming.  While the gettimeofday
> assembly implementation is reused, the build logic is duplicated.  x86
> produces VDSOs for multiple ABIs with a single Makefile; is a similar
> approach not appropriate for arm64?
> 
> 
> > diff --git a/arch/arm64/kernel/vdso-ilp32/vdso-ilp32.lds.S 
> > b/arch/arm64/kernel/vdso-ilp32/vdso-ilp32.lds.S
> > new file mode 100644
> > index 000..ac8029b
> > --- /dev/null
> > +++ b/arch/arm64/kernel/vdso-ilp32/vdso-ilp32.lds.S
> > @@ -0,0 +1,98 @@
> 
> [...]
> 
> > +#include 
> > +#include 
> > +#include 
> > +
> > +/*OUTPUT_FORMAT("elf32-littleaarch64", "elf32-bigaarch64", 
> > "elf32-littleaarch64")
> > +OUTPUT_ARCH(aarch64)
> > +*/
> 
> If these lines aren't needed then omit them.
> 
> [...]
> 
> 
> > +/*
> > + * This controls what symbols we export from the DSO.
> > + */
> > +VERSION
> > +{
> > +   LINUX_2.6.39 {
> > +   global:
> > +   __kernel_rt_sigreturn;
> > +   __kernel_gettimeofday;
> > +   __kernel_clock_gettime;
> > +   __kernel_clock_getres;
> > +   local: *;
> > +   };
> > +}
> 
> Something that came up during review of arch/arm's VDSO code: consider
> using version and names that match x86, i.e. LINUX_2.6, __vdso_gettimeofday.
> 
> http://lists.infradead.org/pipermail/linux-arm-kernel/2014-June/267940.html
> 
> Using LINUX_2.6.39 for this code is nonsensical.
> 
> 
> > diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c
> > index b239b9b..bed6cf1 100644
> > --- a/arch/arm64/kernel/vdso.c
> > +++ b/arch/arm64/kernel/vdso.c
> > @@ -40,6 +40,12 @@ extern char vdso_start, vdso_end;
> >  static unsigned long vdso_pages;
> >  static struct page **vdso_pagelist;
> >  
> > +#ifdef CONFIG_ARM64_ILP32
> > +extern char vdso_ilp32_start, vdso_ilp32_end;
> > +static unsigned long vdso_ilp32_pages;
> > +static struct page **vdso_ilp32_pagelist;
> > +#endif
> > +
> >  /*
> >   * The vDSO data page.
> >   */
> > @@ -117,24 +123,29 @@ int aarch32_setup_vectors_page(struct linux_binprm 
> > *bprm, int uses_interp)
> >  }
> >  #endif /* CONFIG_AARCH32_EL0 */
> >  
> > -static struct vm_special_mapping vdso_spec[2];
> > -
> > -static int __init vdso_init(void)
> > +static inline int __init vdso_init_common(char *vdso_start, char *vdso_end,
> 
> No inline please.
> 
> 
> > + unsigned long *vdso_pagesp,
> > + struct page ***vdso_pagelistp,
> > + struct vm_special_mapping* vdso_spec)
> >  {
> 
> [...]
> 
> >  int arch_setup_additional_pages(struct linux_binprm *bprm,
> > int uses_interp)
> >  {
> > struct mm_struct *mm = current->mm;
> > unsigned long vdso_base, vdso_text_len, vdso_mapping_len;
> > -   void *ret;
> > +   void* ret;
> 
> Gratuitous (and incorrect) style change.
> 
> 
> > +   unsigned long pages = vdso_pages;
> > +   struct vm_special_mapping* spec = vdso_spec;
> 
> Incorrect style:  *spec

Hi Nathan,

If Philipp Philipp Tomsich will not answer soon, I'll fix all this.

BR,
Yury.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/2] perf: drop unneeded bitmap_zero() in util/header.c

2018-06-23 Thread Yury Norov
On top of next-20180622.

bitmap_zero() is called after bitmap_alloc() in perf code. But
bitmap_alloc() internally uses calloc(), which guarantees that the allocated
area is zeroed. So the following bitmap_zero() is unneeded. Drop it.

This happened because of the confusing name of the bitmap allocator: it
should be named bitmap_zalloc instead of bitmap_alloc. This series:
https://lkml.org/lkml/2018/6/18/841
introduces a new API for bitmap allocations in the kernel, and the functions
there are named correctly. The following patch propagates the API to tools,
and fixes the naming issue.

Signed-off-by: Yury Norov 
---
 tools/perf/tests/bitmap.c   | 2 --
 tools/perf/tests/mem2node.c | 5 +
 tools/perf/util/header.c| 3 ---
 3 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/tools/perf/tests/bitmap.c b/tools/perf/tests/bitmap.c
index 47bedf25ba69..96e7fc1ad3f9 100644
--- a/tools/perf/tests/bitmap.c
+++ b/tools/perf/tests/bitmap.c
@@ -16,8 +16,6 @@ static unsigned long *get_bitmap(const char *str, int nbits)
bm = bitmap_alloc(nbits);
 
if (map && bm) {
-   bitmap_zero(bm, nbits);
-
for (i = 0; i < map->nr; i++)
set_bit(map->map[i], bm);
}
diff --git a/tools/perf/tests/mem2node.c b/tools/perf/tests/mem2node.c
index 0c3c87f86e03..d8e3d49d3638 100644
--- a/tools/perf/tests/mem2node.c
+++ b/tools/perf/tests/mem2node.c
@@ -24,11 +24,8 @@ static unsigned long *get_bitmap(const char *str, int nbits)
bm = bitmap_alloc(nbits);
 
if (map && bm) {
-   bitmap_zero(bm, nbits);
-
-   for (i = 0; i < map->nr; i++) {
+   for (i = 0; i < map->nr; i++)
set_bit(map->map[i], bm);
-   }
}
 
if (map)
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 540cd2dcd3e7..3a6bec22baa3 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -279,8 +279,6 @@ static int do_read_bitmap(struct feat_fd *ff, unsigned long 
**pset, u64 *psize)
if (!set)
return -ENOMEM;
 
-   bitmap_zero(set, size);
-
p = (u64 *) set;
 
for (i = 0; (u64) i < BITS_TO_U64(size); i++) {
@@ -1285,7 +1283,6 @@ static int memory_node__read(struct memory_node *n, 
unsigned long idx)
return -ENOMEM;
}
 
-   bitmap_zero(n->set, size);
n->node = idx;
n->size = size;
 
-- 
2.17.1



[PATCH 2/2] bitmap: sync tools with new bitmap allocation API

2018-06-23 Thread Yury Norov
On top of next-20180622 and Andy Shevchenko series:
https://lkml.org/lkml/2018/6/18/841

The series mentioned above introduces helpers for bitmap allocation.
tools/ has its own bitmap_alloc() which differs from bitmap_alloc()
proposed in new kernel API, and is equivalent to bitmap_zalloc().
In this series tools is switched to new API.

This is RFC because I didn't find counterpart free() call to some
bitmap_zalloc()'s. So I didn't convert them to bitmap_free(). Could
someone point them out? The functions are:
setup_nodes();
do_read_bitmap(); // Free is called, but only in fail path.
memory_node__read();

Signed-off-by: Yury Norov 
---
 tools/include/linux/bitmap.h | 19 +++
 tools/perf/builtin-c2c.c | 10 +-
 tools/perf/tests/bitmap.c|  4 ++--
 tools/perf/tests/mem2node.c  |  4 ++--
 tools/perf/util/header.c |  6 +++---
 5 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h
index 48c208437bbd..b9b85b94c937 100644
--- a/tools/include/linux/bitmap.h
+++ b/tools/include/linux/bitmap.h
@@ -98,12 +98,23 @@ static inline int test_and_set_bit(int nr, unsigned long 
*addr)
 }
 
 /**
- * bitmap_alloc - Allocate bitmap
- * @nbits: Number of bits
+ * Allocation and deallocation of bitmap.
  */
-static inline unsigned long *bitmap_alloc(int nbits)
+static inline unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags)
 {
-   return calloc(1, BITS_TO_LONGS(nbits) * sizeof(unsigned long));
+   (void) flags;
+   return malloc(BITS_TO_LONGS(nbits) * sizeof(unsigned long));
+}
+
+static inline unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags)
+{
+   (void) flags;
+   return calloc(BITS_TO_LONGS(nbits), sizeof(unsigned long));
+}
+
+static inline void bitmap_free(const unsigned long *bitmap)
+{
+   free((unsigned long *)bitmap);
 }
 
 /*
diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c
index 6a8738f7ead3..d747e98bc37d 100644
--- a/tools/perf/builtin-c2c.c
+++ b/tools/perf/builtin-c2c.c
@@ -127,11 +127,11 @@ static void *c2c_he_zalloc(size_t size)
if (!c2c_he)
return NULL;
 
-   c2c_he->cpuset = bitmap_alloc(c2c.cpus_cnt);
+   c2c_he->cpuset = bitmap_zalloc(c2c.cpus_cnt, 0);
if (!c2c_he->cpuset)
return NULL;
 
-   c2c_he->nodeset = bitmap_alloc(c2c.nodes_cnt);
+   c2c_he->nodeset = bitmap_zalloc(c2c.nodes_cnt, 0);
if (!c2c_he->nodeset)
return NULL;
 
@@ -156,8 +156,8 @@ static void c2c_he_free(void *he)
free(c2c_he->hists);
}
 
-   free(c2c_he->cpuset);
-   free(c2c_he->nodeset);
+   bitmap_free(c2c_he->cpuset);
+   bitmap_free(c2c_he->nodeset);
free(c2c_he->nodestr);
free(c2c_he->node_stats);
free(c2c_he);
@@ -2051,7 +2051,7 @@ static int setup_nodes(struct perf_session *session)
struct cpu_map *map = n[node].map;
unsigned long *set;
 
-   set = bitmap_alloc(c2c.cpus_cnt);
+   set = bitmap_zalloc(c2c.cpus_cnt, 0);
if (!set)
return -ENOMEM;
 
diff --git a/tools/perf/tests/bitmap.c b/tools/perf/tests/bitmap.c
index 96e7fc1ad3f9..a35d44ad54bc 100644
--- a/tools/perf/tests/bitmap.c
+++ b/tools/perf/tests/bitmap.c
@@ -13,7 +13,7 @@ static unsigned long *get_bitmap(const char *str, int nbits)
unsigned long *bm = NULL;
int i;
 
-   bm = bitmap_alloc(nbits);
+   bm = bitmap_zalloc(nbits, 0);
 
if (map && bm) {
for (i = 0; i < map->nr; i++)
@@ -35,7 +35,7 @@ static int test_bitmap(const char *str)
pr_debug("bitmap: %s\n", buf);
 
ret = !strcmp(buf, str);
-   free(bm);
+   bitmap_free(bm);
return ret;
 }
 
diff --git a/tools/perf/tests/mem2node.c b/tools/perf/tests/mem2node.c
index d8e3d49d3638..81a9b05dc632 100644
--- a/tools/perf/tests/mem2node.c
+++ b/tools/perf/tests/mem2node.c
@@ -21,7 +21,7 @@ static unsigned long *get_bitmap(const char *str, int nbits)
unsigned long *bm = NULL;
int i;
 
-   bm = bitmap_alloc(nbits);
+   bm = bitmap_zalloc(nbits, 0);
 
if (map && bm) {
for (i = 0; i < map->nr; i++)
@@ -65,7 +65,7 @@ int test__mem2node(struct test *t __maybe_unused, int subtest 
__maybe_unused)
T("failed: mem2node__node", -1 == mem2node__node(&map, 0x1050));
 
for (i = 0; i < ARRAY_SIZE(nodes); i++)
-   free(nodes[i].set);
+   bitmap_free(nodes[i].set);
 
mem2node__exit(&map);
return 0;
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 3a6bec22baa3..201c91db95df 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -275,7 +275,7 @@ static int do_read_bitmap(struct feat_fd *ff, unsigned long 

Re: [PATCH 2/2] bitmap: sync tools with new bitmap allocation API

2018-06-24 Thread Yury Norov
On Sun, Jun 24, 2018 at 02:31:03PM -0700, Dmitry Torokhov wrote:
> External Email
> 
> On Sat, Jun 23, 2018 at 10:35:02AM +0300, Yury Norov wrote:
> > On top of next-20180622 and Andy Shevchenko series:
> > https://lkml.org/lkml/2018/6/18/841
> >
> > The series mentioned above introduces helpers for bitmap allocation.
> > tools/ has its own bitmap_alloc() which differs from bitmap_alloc()
> > proposed in new kernel API, and is equivalent to bitmap_zalloc().
> > In this series tools is switched to new API.
> >
> > This is RFC because I didn't find counterpart free() call to some
> > bitmap_zalloc()'s. So I didn't convert them to bitmap_free(). Could
> > someone point them out? The functions are:
> > setup_nodes();
> > do_read_bitmap(); // Free is called, but only in fail path.
> 
> Yes, because if we succeed we effectively return allocated bitmap to the
> caller. You'd need to trace upwards and see how it all gets cleaned up.
> But given that this is userspace and is not expected to be long-lived,
> maybe nobody bothered freeing memory and we instead rely on the kernel
> to clean it all up when process terminates.
> 
> Thanks.
> 
> > memory_node__read();
> >
> > Signed-off-by: Yury Norov 
> > ---
> >  tools/include/linux/bitmap.h | 19 +++
> >  tools/perf/builtin-c2c.c | 10 +-
> >  tools/perf/tests/bitmap.c|  4 ++--
> >  tools/perf/tests/mem2node.c  |  4 ++--
> >  tools/perf/util/header.c |  6 +++---
> >  5 files changed, 27 insertions(+), 16 deletions(-)
> >
> > diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h
> > index 48c208437bbd..b9b85b94c937 100644
> > --- a/tools/include/linux/bitmap.h
> > +++ b/tools/include/linux/bitmap.h
> > @@ -98,12 +98,23 @@ static inline int test_and_set_bit(int nr, unsigned 
> > long *addr)
> >  }
> >
> >  /**
> > - * bitmap_alloc - Allocate bitmap
> > - * @nbits: Number of bits
> > + * Allocation and deallocation of bitmap.
> >   */
> > -static inline unsigned long *bitmap_alloc(int nbits)
> > +static inline unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags)
> 
> This makes absolutely no sense for userspace API. What gfp_t even means
> here?
> 
> If you want to introduce bitmap_zalloc and bitmap_free it is fine but
> adding dummy parameters to match kernel API exactly is a folly.

Identical API makes easier porting the code from kernel to tools.
Refer for example declaration of kmalloc in:
tools/testing/radix-tree/linux.c
tools/testing/scatterlist/linux/mm.h
tools/virtio/linux/kernel.h
tools/virtio/ringtest/ptr_ring.c

Yury


Re: [PATCH] linux/bitmap.h: fix BITMAP_LAST_WORD_MASK

2018-07-26 Thread Yury Norov
On Thu, Jul 26, 2018 at 04:07:51PM +0800, Wei Wang wrote:
> The existing BITMAP_LAST_WORD_MASK macro returns ~0UL (all bits set) if nbits is
> 0. This patch changes the macro to return 0 when there is no bit needs to
> be masked.

I think this is intentional behaviour. Previous version did return ~0UL
explicitly in this case. See patch 89c1e79eb3023 (linux/bitmap.h: improve
BITMAP_{LAST,FIRST}_WORD_MASK) from Rasmus.

Introducing conditional branch would affect performance. All existing
code checks nbits for 0 before handling last word where needed
explicitly. So I think we'd better change nothing here.

Yury

> Signed-off-by: Wei Wang 
> Cc: Andrew Morton 
> Cc: Rasmus Villemoes 
> Cc: Yury Norov 
> ---
>  include/linux/bitmap.h | 5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
> index 1ee46f4..12af3d7 100644
> --- a/include/linux/bitmap.h
> +++ b/include/linux/bitmap.h
> @@ -194,7 +194,10 @@ extern int bitmap_print_to_pagebuf(bool list, char *buf,
>const unsigned long *maskp, int nmaskbits);
> 
>  #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 
> 1)))
> -#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 
> 1)))
> +#define BITMAP_LAST_WORD_MASK(nbits)   \
> +(  \
> +   nbits ? (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) : 0  \
> +)
> 
>  #define small_const_nbits(nbits) \
> (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG)
> --
> 2.7.4


Re: [PATCH] linux/bitmap.h: fix BITMAP_LAST_WORD_MASK

2018-07-26 Thread Yury Norov
On Thu, Jul 26, 2018 at 06:15:59PM +0800, Wei Wang wrote:
> External Email
> 
> On 07/26/2018 05:37 PM, Yury Norov wrote:
> > On Thu, Jul 26, 2018 at 04:07:51PM +0800, Wei Wang wrote:
> > > The existing BITMAP_LAST_WORD_MASK macro returns ~0UL (all bits set) if nbits is
> > > 0. This patch changes the macro to return 0 when there is no bit needs to
> > > be masked.
> > I think this is intentional behaviour. Previous version did return ~0UL
> > explicitly in this case. See patch 89c1e79eb3023 (linux/bitmap.h: improve
> > BITMAP_{LAST,FIRST}_WORD_MASK) from Rasmus.
> 
> Yes, I saw that. But it seems confusing for the corner case that nbits=0
> (no bits to mask), the macro returns with all the bits set.
> 
> 
> > 
> > Introducing conditional branch would affect performance. All existing
> > code checks nbits for 0 before handling last word where needed
> > explicitly. So I think we'd better change nothing here.
> 
> I think that didn't save the conditional branch essentially, because
> it's just moved from inside this macro to the caller as you mentioned.
> If callers missed the check for some reason and passed 0 to the macro,
> they will get something unexpected.
> 
> Current callers like __bitmap_weight, __bitmap_equal, and others, they have
> 
> if (bits % BITS_PER_LONG)
> w += hweight_long(bitmap[k] & BITMAP_LAST_WORD_MASK(bits));
> 
> we could remove the "if" check by "w += hweight_long(bitmap[k] &
> BITMAP_LAST_WORD_MASK(bits % BITS_PER_LONG));" the branch is the same.

But your patch doesn't remove external conditional, and it fact
introduces overhead, right? Also, in some cases it's not so trivial to
remove it. Consider __bitmap_intersects() for example.

Anyway, this patch changes the very basic API. In that case you should
check every user of the macro to be safe against the change, including
possible performance downsides.

If you find this corner case behavior of macro confusing, I think that
the better option would be introducing detailed comment to the
BITMAP_LAST_WORD_MASK(), or writing wrapper around it that handles
nbits == 0 as you expect.

Thanks,
Yury


Re: [PATCH] nohz: don't kick non-idle CPUs in tick_nohz_full_kick_cpu()

2018-07-19 Thread Yury Norov
On Mon, Jul 16, 2018 at 05:31:10PM +0200, Frederic Weisbecker wrote:
> External Email
> 
> On Thu, Jul 12, 2018 at 09:19:22PM +0300, Yury Norov wrote:
> > IIUC, tick_nohz_full_kick_cpu() is intended to wakeup idle CPUs
> > that will not be poked by scheduler because they are actually
> > nohz_full.
> 
> Not exactly. It is intended to trigger an interrupt on a nohz_full
> CPU that may be running in userspace without any tick. The irq_exit()
> code let us reprogramm the tick with the latest dependency updates.
> 
> >
> > But in fact this function kicks all CPUs listed in tick_nohz_full_mask,
> > namely:
> >  - idle CPUs;
> >  - CPUs running normal tasks;
> >  - CPUs running isolated tasks [1];
> >
> > For normal tasks it introduces unneeded latency, and for isolated tasks
> > it's fatal because isolation gets broken and task receives SIGKILL.
> 
> So this patch applies on Chris series right?

This patch may be applied on master. That's why I sent it to you.

> For now there is no such
> distinction between normal and isolated tasks. Any task running in a
> nohz_full CPU is considered to be isolated.
>
> > The patch below makes tick_nohz_full_kick_cpu() kicking only idle CPUs.
> > Non-idle nohz_full CPUs will observe changed system settings just like
> > non-idle normal (i.e. not nohz_full) CPUs, at next reschedule.
> 
> That's not exactly what we want. In fact when a task runs in a nohz_full CPU,
> it may not meet any reschedule interrupt for a long while. This is why we have
> tick_nohz_full_kick_cpu() in order to force a nohz_full CPU to see the latest
> changes.

OK, got it.

So if my understanding correct, there is 'soft isolation' which is
nohz_full, and 'hard isolation' which is Chris' task_isolation feature. For
soft isolation, the desirable behavior is to receive interrupts generated 
by tick_nohz_full_kick_cpu(), and for hard isolation it's obviously not
desirable because it kills application. 

The patch below adds check against task isolation in tick_nohz_full_kick_cpu().
It is on top of Chris' series. Is it OK from nohz point of view?

---

While here. I just wonder, on my system IRQs are sent to nohz_full CPUs
at every incoming ssh connection. The trace is like this:
[  206.835533] Call trace:
[  206.848411] [] dump_stack+0x84/0xa8
[  206.853455] [] _task_isolation_remote+0x130/0x140
[  206.859714] [] irq_work_queue_on+0xcc/0xfc
[  206.865365] [] tick_nohz_full_kick_cpu+0x88/0x94
[  206.871536] [] tick_nohz_dep_set_all+0x78/0xa8
[  206.877533] [] tick_nohz_dep_set_signal+0x28/0x34
[  206.883792] [] set_process_cpu_timer+0xd0/0x128
[  206.889876] [] update_rlimit_cpu+0x58/0x7c
[  206.895528] [] selinux_bprm_committing_creds+0x180/0x1fc
[  206.902394] [] security_bprm_committing_creds+0x40/0x5c
[  206.909173] [] install_exec_creds+0x20/0x6c
[  206.914911] [] load_elf_binary+0x368/0xbb8
[  206.920561] [] search_binary_handler+0xb8/0x224
[  206.926645] [] do_execveat_common+0x44c/0x5f0
[  206.932555] [] do_execve+0x38/0x44
[  206.937510] [] SyS_execve+0x34/0x44

I suspect that scp, ssh tunneling and similar network activities will source 
ticks on nohz_full CPUs as well. On high-loaded server it may generate
significant interrupt traffic on nohz_full CPUs. Is it desirable behavior?

---
Yury

>From 9be3c9996c06319a8070ac182291d08acfdc588d Mon Sep 17 00:00:00 2001
From: Yury Norov 
Date: Tue, 17 Jul 2018 12:40:49 +0300
Subject: [PATCH] task_isolation: don't kick isolated CPUs with
 tick_nohz_full_kick_cpu()
To: Chris Metcalf , 
Frederic Weisbecker 
Cc: Ingo Molnar ,
Thomas Gleixner ,
"Goutham, Sunil" ,
linux-kernel@vger.kernel.org

On top of Chris Metcalf series:
https://lkml.org/lkml/2017/11/3/589

tick_nohz_full_kick_cpu() currently interrupts CPUs that may run isolated
task. It's not desirable because that kick will kill isolated application.

The patch below adds check against task isolation in
tick_nohz_full_kick_cpu() to prevent breaking the isolation.

Signed-off-by: Yury Norov 
---
 include/linux/isolation.h | 7 +++
 kernel/isolation.c| 6 --
 kernel/time/tick-sched.c  | 5 +++--
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/include/linux/isolation.h b/include/linux/isolation.h
index b7f0a9085b13..fad606cdcd5e 100644
--- a/include/linux/isolation.h
+++ b/include/linux/isolation.h
@@ -158,6 +158,12 @@ static inline void task_isolation_user_exit(void)
 #endif
 }
 
+static inline bool is_isolation_cpu(int cpu)
+{
+   return task_isolation_map != NULL &&
+   cpumask_test_cpu(cpu, task_isolation_map);
+}
+
 #else /* !CONFIG_TASK_ISOLATION */
 static inline int task_isolation_request(unsigned int flags) { return -EINVAL; 
}
 static inline void task_isolation_start(void) { }
@@ -1

Re: [PATCH 1/2] perf: drop unneeded bitmap_zero() in util/header.c

2018-07-24 Thread Yury Norov
On Sat, Jun 23, 2018 at 10:35:01AM +0300, Yury Norov wrote:
> On top of next-20180622.
> 
> bitmap_zero() is called after bitmap_alloc() in perf code. But
> bitmap_alloc() internally uses calloc() which guarantees that allocated
> area is zeroed. So following bitmap_zero is unneeded. Drop it.
> 
> This happened because of the confusing name of the bitmap allocator. It
> should have the name bitmap_zalloc instead of bitmap_alloc. This series:
> https://lkml.org/lkml/2018/6/18/841
> introduces new API for bitmap allocations in kernel, and functions
> there are named correctly. The following patch propagates the API to tools,
> and fixes naming issue.
> 
> Signed-off-by: Yury Norov 

Ping?

> ---
>  tools/perf/tests/bitmap.c   | 2 --
>  tools/perf/tests/mem2node.c | 5 +
>  tools/perf/util/header.c| 3 ---
>  3 files changed, 1 insertion(+), 9 deletions(-)
> 
> diff --git a/tools/perf/tests/bitmap.c b/tools/perf/tests/bitmap.c
> index 47bedf25ba69..96e7fc1ad3f9 100644
> --- a/tools/perf/tests/bitmap.c
> +++ b/tools/perf/tests/bitmap.c
> @@ -16,8 +16,6 @@ static unsigned long *get_bitmap(const char *str, int nbits)
>   bm = bitmap_alloc(nbits);
>  
>   if (map && bm) {
> - bitmap_zero(bm, nbits);
> -
>   for (i = 0; i < map->nr; i++)
>   set_bit(map->map[i], bm);
>   }
> diff --git a/tools/perf/tests/mem2node.c b/tools/perf/tests/mem2node.c
> index 0c3c87f86e03..d8e3d49d3638 100644
> --- a/tools/perf/tests/mem2node.c
> +++ b/tools/perf/tests/mem2node.c
> @@ -24,11 +24,8 @@ static unsigned long *get_bitmap(const char *str, int 
> nbits)
>   bm = bitmap_alloc(nbits);
>  
>   if (map && bm) {
> - bitmap_zero(bm, nbits);
> -
> - for (i = 0; i < map->nr; i++) {
> + for (i = 0; i < map->nr; i++)
>   set_bit(map->map[i], bm);
> - }
>   }
>  
>   if (map)
> diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
> index 540cd2dcd3e7..3a6bec22baa3 100644
> --- a/tools/perf/util/header.c
> +++ b/tools/perf/util/header.c
> @@ -279,8 +279,6 @@ static int do_read_bitmap(struct feat_fd *ff, unsigned 
> long **pset, u64 *psize)
>   if (!set)
>   return -ENOMEM;
>  
> - bitmap_zero(set, size);
> -
>   p = (u64 *) set;
>  
>   for (i = 0; (u64) i < BITS_TO_U64(size); i++) {
> @@ -1285,7 +1283,6 @@ static int memory_node__read(struct memory_node *n, 
> unsigned long idx)
>   return -ENOMEM;
>   }
>  
> - bitmap_zero(n->set, size);
>   n->node = idx;
>   n->size = size;
>  
> -- 
> 2.17.1


[PATCH] nohz: don't kick non-idle CPUs in tick_nohz_full_kick_cpu()

2018-07-12 Thread Yury Norov
IIUC, tick_nohz_full_kick_cpu() is intended to wakeup idle CPUs
that will not be poked by scheduler because they are actually
nohz_full.

But in fact this function kicks all CPUs listed in tick_nohz_full_mask,
namely:
 - idle CPUs;
 - CPUs running normal tasks;
 - CPUs running isolated tasks [1];

For normal tasks it introduces unneeded latency, and for isolated tasks
it's fatal because isolation gets broken and task receives SIGKILL.

The patch below makes tick_nohz_full_kick_cpu() kicking only idle CPUs.
Non-idle nohz_full CPUs will observe changed system settings just like
non-idle normal (i.e. not nohz_full) CPUs, at next reschedule.

[1] https://lkml.org/lkml/2017/11/3/589

Signed-off-by: Yury Norov 
---
 kernel/time/tick-sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index c026145eba2f..1c24c700e75a 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -247,7 +247,7 @@ static void tick_nohz_full_kick(void)
  */
 void tick_nohz_full_kick_cpu(int cpu)
 {
-   if (!tick_nohz_full_cpu(cpu))
+   if (!(tick_nohz_full_cpu(cpu) && idle_cpu(cpu)))
return;
 
irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
-- 
2.17.1



[PATCH RFC 0/3] API for 128-bit IO access

2018-01-24 Thread Yury Norov
x23, #0xdead, lsl #16
  64:   aa1603e2mov x2, x22
  68:   aa1703e1mov x1, x23
  6c:   aa1503e0mov x0, x21
  70:   9400bl  0 
  74:   aa1403e2mov x2, x20
  78:   aa1303e1mov x1, x19
  7c:   ca160294eor x20, x20, x22
  80:   ca170273eor x19, x19, x23
  84:   aa1503e0mov x0, x21
  88:   aa140273orr x19, x19, x20
  8c:   9400bl  0 
  90:   f9401bf7ldr x23, [sp, #48]
  94:   f100027fcmp x19, #0x0
  98:   a94153f3ldp x19, x20, [sp, #16]
  9c:   1a9f07e0csetw0, ne  // ne = any
  a0:   a9425bf5ldp x21, x22, [sp, #32]
  a4:   a8c57bfdldp x29, x30, [sp], #80
  a8:   d65f03c0ret

I tested LE kernel with this, and it works OK for me. BE version adds
few extra instructions to swap bytes, but generated code looks reasonable. 
We can avoid byteswapping, if not needed, by using __raw_reado() and 
__raw_writeo().

Yury Norov (3):
  UAPI: Introduce 128-bit types and byteswap operations
  asm-generic/io.h: API for 128-bit I/O accessors
  arm64: enable 128-bit memory read/write support

 arch/Kconfig |   7 ++
 arch/arm64/include/asm/io.h  |  31 ++
 include/asm-generic/io.h | 147 +++
 include/linux/byteorder/generic.h|   4 +
 include/uapi/asm-generic/int-ll64.h  |   8 ++
 include/uapi/linux/byteorder/big_endian.h|   2 +
 include/uapi/linux/byteorder/little_endian.h |   4 +
 include/uapi/linux/swab.h|  22 
 include/uapi/linux/types.h   |   4 +
 9 files changed, 229 insertions(+)

-- 
2.11.0



[PATCH 1/3] UAPI: Introduce 128-bit types and byteswap operations

2018-01-24 Thread Yury Norov
Architectures like arm64 support 128-bit integer types and
operations. This patch introduces corresponding types and
__swab128() operation for be/le conversions.

They are required to implement 128-bit access to the memory,
in following patches.

Signed-off-by: Yury Norov 
---
 include/linux/byteorder/generic.h|  8 
 include/uapi/asm-generic/int-ll64.h  |  8 
 include/uapi/linux/byteorder/big_endian.h|  4 
 include/uapi/linux/byteorder/little_endian.h |  8 
 include/uapi/linux/swab.h| 22 ++
 include/uapi/linux/types.h   |  4 
 6 files changed, 54 insertions(+)

diff --git a/include/linux/byteorder/generic.h 
b/include/linux/byteorder/generic.h
index 451aaa0786ae..aa61662ee3dc 100644
--- a/include/linux/byteorder/generic.h
+++ b/include/linux/byteorder/generic.h
@@ -85,12 +85,20 @@
 
 #define cpu_to_le64 __cpu_to_le64
 #define le64_to_cpu __le64_to_cpu
+#ifdef CONFIG_HAVE_128BIT_ACCESS
+#define cpu_to_le128 __cpu_to_le128
+#define le128_to_cpu __le128_to_cpu
+#endif
 #define cpu_to_le32 __cpu_to_le32
 #define le32_to_cpu __le32_to_cpu
 #define cpu_to_le16 __cpu_to_le16
 #define le16_to_cpu __le16_to_cpu
 #define cpu_to_be64 __cpu_to_be64
 #define be64_to_cpu __be64_to_cpu
+#ifdef CONFIG_HAVE_128BIT_ACCESS
+#define cpu_to_be128 __cpu_to_be128
+#define be128_to_cpu __be128_to_cpu
+#endif
 #define cpu_to_be32 __cpu_to_be32
 #define be32_to_cpu __be32_to_cpu
 #define cpu_to_be16 __cpu_to_be16
diff --git a/include/uapi/asm-generic/int-ll64.h 
b/include/uapi/asm-generic/int-ll64.h
index 1ed06964257c..4bc2241988a9 100644
--- a/include/uapi/asm-generic/int-ll64.h
+++ b/include/uapi/asm-generic/int-ll64.h
@@ -29,9 +29,17 @@ typedef unsigned int __u32;
 #ifdef __GNUC__
 __extension__ typedef __signed__ long long __s64;
 __extension__ typedef unsigned long long __u64;
+#ifdef CONFIG_HAVE_128BIT_ACCESS
+__extension__ typedef __int128_t __s128;
+__extension__ typedef __uint128_t __u128;
+#endif
 #else
 typedef __signed__ long long __s64;
 typedef unsigned long long __u64;
+#ifdef CONFIG_HAVE_128BIT_ACCESS
+typedef __int128_t __s128;
+typedef __uint128_t __u128;
+#endif
 #endif
 
 #endif /* __ASSEMBLY__ */
diff --git a/include/uapi/linux/byteorder/big_endian.h 
b/include/uapi/linux/byteorder/big_endian.h
index 2199adc6a6c2..28a69ec10dd2 100644
--- a/include/uapi/linux/byteorder/big_endian.h
+++ b/include/uapi/linux/byteorder/big_endian.h
@@ -30,6 +30,10 @@
 #define __constant_be16_to_cpu(x) ((__force __u16)(__be16)(x))
 #define __cpu_to_le64(x) ((__force __le64)__swab64((x)))
 #define __le64_to_cpu(x) __swab64((__force __u64)(__le64)(x))
+#ifdef CONFIG_HAVE_128BIT_ACCESS
+#define __cpu_to_le128(x) ((__force __le128)__swab128((x)))
+#define __le128_to_cpu(x) __swab128((__force __u128)(__le128)(x))
+#endif
 #define __cpu_to_le32(x) ((__force __le32)__swab32((x)))
 #define __le32_to_cpu(x) __swab32((__force __u32)(__le32)(x))
 #define __cpu_to_le16(x) ((__force __le16)__swab16((x)))
diff --git a/include/uapi/linux/byteorder/little_endian.h 
b/include/uapi/linux/byteorder/little_endian.h
index 601c904fd5cd..15365bd0fe29 100644
--- a/include/uapi/linux/byteorder/little_endian.h
+++ b/include/uapi/linux/byteorder/little_endian.h
@@ -18,6 +18,10 @@
 #define __constant_ntohs(x) ___constant_swab16((__force __be16)(x))
 #define __constant_cpu_to_le64(x) ((__force __le64)(__u64)(x))
 #define __constant_le64_to_cpu(x) ((__force __u64)(__le64)(x))
+#ifdef CONFIG_HAVE_128BIT_ACCESS
+#define __constant_cpu_to_le128(x) ((__force __le128)(__u128)(x))
+#define __constant_le128_to_cpu(x) ((__force __u128)(__le128)(x))
+#endif
 #define __constant_cpu_to_le32(x) ((__force __le32)(__u32)(x))
 #define __constant_le32_to_cpu(x) ((__force __u32)(__le32)(x))
 #define __constant_cpu_to_le16(x) ((__force __le16)(__u16)(x))
@@ -30,6 +34,10 @@
 #define __constant_be16_to_cpu(x) ___constant_swab16((__force 
__u16)(__be16)(x))
 #define __cpu_to_le64(x) ((__force __le64)(__u64)(x))
 #define __le64_to_cpu(x) ((__force __u64)(__le64)(x))
+#ifdef CONFIG_HAVE_128BIT_ACCESS
+#define __cpu_to_le128(x) ((__force __le128)(__u128)(x))
+#define __le128_to_cpu(x) ((__force __u128)(__le128)(x))
+#endif
 #define __cpu_to_le32(x) ((__force __le32)(__u32)(x))
 #define __le32_to_cpu(x) ((__force __u32)(__le32)(x))
 #define __cpu_to_le16(x) ((__force __le16)(__u16)(x))
diff --git a/include/uapi/linux/swab.h b/include/uapi/linux/swab.h
index 23cd84868cc3..a7e97eb06a3e 100644
--- a/include/uapi/linux/swab.h
+++ b/include/uapi/linux/swab.h
@@ -75,6 +75,20 @@ static inline __attribute_const__ __u64 __fswab64(__u64 val)
 #endif
 }
 
+#ifdef CONFIG_HAVE_128BIT_ACCESS
+static inline __attribute_const__ __u128 __fswab128(__u128 val)
+{
+#if defined(__arch_swab128)
+   return __arch_swab128(val);
+#else
+   __u64 h = (__u64) (val >> 64);
+   __u64 l = (__u64) val;
+
+   return (((__u128)__fswab64(l)) << 64) | (__u128)(__fswab64(h));
+#endi

  1   2   3   4   5   6   7   8   9   10   >