Re: [PATCH] fadump: fix endianness issues in firmware assisted dump handling

2014-09-04 Thread Mahesh Jagannath Salgaonkar
On 09/03/2014 05:59 PM, Hari Bathini wrote:
> Firmware-assisted dump (fadump) kernel code is not LE compliant. The
> below patch fixes this issue. Tested this patch with the upstream
> kernel and did some sanity testing on the generated LE fadump vmcore.
> The below output shows the crash tool successfully opening an LE fadump
> vmcore.
> 
>   # crash $vmlinux vmcore
> 
>   crash 7.0.5
>   Copyright (C) 2002-2014  Red Hat, Inc.
>   Copyright (C) 2004, 2005, 2006, 2010  IBM Corporation
>   Copyright (C) 1999-2006  Hewlett-Packard Co
>   Copyright (C) 2005, 2006, 2011, 2012  Fujitsu Limited
>   Copyright (C) 2006, 2007  VA Linux Systems Japan K.K.
>   Copyright (C) 2005, 2011  NEC Corporation
>   Copyright (C) 1999, 2002, 2007  Silicon Graphics, Inc.
>   Copyright (C) 1999, 2000, 2001, 2002  Mission Critical Linux, Inc.
>   This program is free software, covered by the GNU General Public License,
>   and you are welcome to change it and/or distribute copies of it under
>   certain conditions.  Enter "help copying" to see the conditions.
>   This program has absolutely no warranty.  Enter "help warranty" for details.
> 
>   crash: /boot/vmlinux-3.16.0-rc7-7-default+: no .gnu_debuglink section
>   GNU gdb (GDB) 7.6
>   Copyright (C) 2013 Free Software Foundation, Inc.
>   License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
> 
>   This is free software: you are free to change and redistribute it.
>   There is NO WARRANTY, to the extent permitted by law.  Type "show copying"
>   and "show warranty" for details.
>   This GDB was configured as "powerpc64le-unknown-linux-gnu"...
> 
>       KERNEL: /boot/vmlinux-3.16.0-rc7-7-default+
>     DUMPFILE: vmcore
>         CPUS: 16
>         DATE: Sun Aug 24 14:31:28 2014
>       UPTIME: 00:02:57
> LOAD AVERAGE: 0.05, 0.08, 0.04
>        TASKS: 256
>     NODENAME: linux-dhr2
>      RELEASE: 3.16.0-rc7-7-default+
>      VERSION: #54 SMP Mon Aug 18 14:08:23 EDT 2014
>      MACHINE: ppc64le  (4116 MHz)
>       MEMORY: 40 GB
>        PANIC: "Oops: Kernel access of bad area, sig: 11 [#1]" (check log for details)
>          PID: 2234
>      COMMAND: "bash"
>         TASK: c009652e4a30  [THREAD_INFO: c0096777c000]
>          CPU: 2
>        STATE: TASK_RUNNING (PANIC)
> 
>   crash>
> 
> Signed-off-by: Hari Bathini 

Reviewed-by: Mahesh Salgaonkar 

> ---
>  arch/powerpc/include/asm/fadump.h     |   52 ---
>  arch/powerpc/kernel/fadump.c          |  112 +
>  arch/powerpc/platforms/pseries/lpar.c |    9 ++-
>  3 files changed, 89 insertions(+), 84 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/fadump.h 
> b/arch/powerpc/include/asm/fadump.h
> index a677456..493e72f 100644
> --- a/arch/powerpc/include/asm/fadump.h
> +++ b/arch/powerpc/include/asm/fadump.h
> @@ -70,39 +70,39 @@
>  #define CPU_UNKNOWN  (~((u32)0))
> 
>  /* Utility macros */
> -#define SKIP_TO_NEXT_CPU(reg_entry)  \
> -({   \
> - while (reg_entry->reg_id != REG_ID("CPUEND"))   \
> - reg_entry++;\
> - reg_entry++;\
> +#define SKIP_TO_NEXT_CPU(reg_entry)  \
> +({   \
> + while (be64_to_cpu(reg_entry->reg_id) != REG_ID("CPUEND"))  \
> + reg_entry++;\
> + reg_entry++;\
>  })
> 
>  /* Kernel Dump section info */
>  struct fadump_section {
> - u32 request_flag;
> - u16 source_data_type;
> - u16 error_flags;
> - u64 source_address;
> - u64 source_len;
> - u64 bytes_dumped;
> - u64 destination_address;
> + __be32  request_flag;
> + __be16  source_data_type;
> + __be16  error_flags;
> + __be64  source_address;
> + __be64  source_len;
> + __be64  bytes_dumped;
> + __be64  destination_address;
>  };
> 
>  /* ibm,configure-kernel-dump header. */
>  struct fadump_section_header {
> - u32 dump_format_version;
> - u16 dump_num_sections;
> - u16 dump_status_flag;
> - u32 offset_first_dump_section;
> + __be32  dump_format_version;
> + __be16  dump_num_sections;
> + __be16  dump_status_flag;
> + __be32  offset_first_dump_section;
> 
>   /* Fields for disk dump option. */
> - u32 dd_block_size;
> - u64 dd_block_offset;
> - u64 dd_num_blocks;
> - u32 dd_offset_disk_path;
> + __be32  dd_block_size;
> + __be64  dd_block_offset;
> + __be64  dd_num_blocks;
> + __be32  dd_offset_disk_path;
> 
>   /* Maximum t
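
The pattern above is mechanical: every field that firmware writes in
big-endian byte order becomes a __be* type, and every kernel access is
wrapped in be*_to_cpu()/cpu_to_be*(). A minimal userspace sketch of the
same pattern (the struct, field names and values are illustrative, not
the real fadump layout):

/*
 * Build with: gcc -o endian-demo endian-demo.c
 * Uses glibc's <endian.h> conversion helpers, which mirror the
 * kernel's be32_to_cpu()/be64_to_cpu().
 */
#include <endian.h>
#include <stdint.h>
#include <stdio.h>

struct demo_section {
        uint32_t request_flag;  /* stored big-endian, as by firmware */
        uint64_t source_len;    /* stored big-endian, as by firmware */
};

int main(void)
{
        struct demo_section s;

        /* Simulate firmware filling the fields in big-endian order. */
        s.request_flag = htobe32(1);
        s.source_len   = htobe64(0x4000);

        /* An LE CPU must convert on every read; reading the raw bytes
         * without the conversion is exactly the bug being fixed. */
        printf("flag=%u len=%llu\n",
               (unsigned)be32toh(s.request_flag),
               (unsigned long long)be64toh(s.source_len));
        return 0;
}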

deb-pkg: Add support for powerpc little endian

2014-09-04 Thread Michael Neuling
The Debian powerpc little endian architecture is called ppc64el.  This
is the default architecture used by Ubuntu for powerpc.

The below checks the kernel config to see if we are compiling little
endian and sets the Debian arch appropriately.

Signed-off-by: Michael Neuling 

diff --git a/scripts/package/builddeb b/scripts/package/builddeb
index 35d5a58..6f4a1af 100644
--- a/scripts/package/builddeb
+++ b/scripts/package/builddeb
@@ -37,7 +37,7 @@ create_package() {
 	s390*)
 		debarch=s390$(grep -q CONFIG_64BIT=y $KCONFIG_CONFIG && echo x || true) ;;
 	ppc*)
-		debarch=powerpc ;;
+		debarch=$(grep -q CPU_LITTLE_ENDIAN=y $KCONFIG_CONFIG && echo ppc64el || echo powerpc) ;;
 	parisc*)
 		debarch=hppa ;;
 	mips*)


Re: bit fields && data tearing

2014-09-04 Thread Paul E. McKenney
On Thu, Sep 04, 2014 at 10:47:24PM -0400, Peter Hurley wrote:
> Hi James,
> 
> On 09/04/2014 10:11 PM, James Bottomley wrote:
> > On Thu, 2014-09-04 at 17:17 -0700, Paul E. McKenney wrote:
> >> +And there are anti-guarantees:
> >> +
> >> + (*) These guarantees do not apply to bitfields, because compilers often
> >> + generate code to modify these using non-atomic read-modify-write
> >> + sequences.  Do not attempt to use bitfields to synchronize parallel
> >> + algorithms.
> >> +
> >> + (*) Even in cases where bitfields are protected by locks, all fields
> >> + in a given bitfield must be protected by one lock.  If two fields
> >> + in a given bitfield are protected by different locks, the compiler's
> >> + non-atomic read-modify-write sequences can cause an update to one
> >> + field to corrupt the value of an adjacent field.
> >> +
> >> + (*) These guarantees apply only to properly aligned and sized scalar
> >> + variables.  "Properly sized" currently means "int" and "long",
> >> + because some CPU families do not support loads and stores of
> >> + other sizes.  ("Some CPU families" is currently believed to
> >> + be only Alpha 21064.  If this is actually the case, a different
> >> + non-guarantee is likely to be formulated.)
> > 
> > This is a bit unclear.  Presumably you're talking about definiteness of
> > the outcome (as in what's seen after multiple stores to the same
> > variable).
> 
> No, the last condition refers to adjacent byte stores from different
> cpu contexts (either interrupt or SMP).
> 
> > The guarantees are only for natural width on Parisc as well,
> > so you would get a mess if you did byte stores to adjacent memory
> > locations.
> 
> For a simple test like:
> 
> struct x {
>   long a;
>   char b;
>   char c;
>   char d;
>   char e;
> };
> 
> void store_bc(struct x *p) {
>   p->b = 1;
>   p->c = 2;
> }
> 
> on parisc, gcc generates separate byte stores
> 
> void store_bc(struct x *p) {
>0: 34 1c 00 02 ldi 1,ret0
>4: 0f 5c 12 08 stb ret0,4(r26)
>8: 34 1c 00 04 ldi 2,ret0
>c: e8 40 c0 00 bv r0(rp)
>   10: 0f 5c 12 0a stb ret0,5(r26)
> 
> which appears to confirm that on parisc adjacent byte data
> is safe from corruption by concurrent cpu updates; that is,
> 
> CPU 0| CPU 1
>  |
> p->b = 1 | p->c = 2
>  |
> 
> will result in p->b == 1 && p->c == 2 (assume both values
> were 0 before the call to store_bc()).

What Peter said.  I would ask for suggestions for better wording, but
I would much rather be able to say that single-byte reads and writes
are atomic and that aligned-short reads and writes are also atomic.

Thus far, it looks like we lose only very old Alpha systems, so unless
I hear otherwise, I will update my patch to outlaw these very old systems.

Thanx, Paul


Re: bit fields && data tearing

2014-09-04 Thread Peter Hurley
Hi James,

On 09/04/2014 10:11 PM, James Bottomley wrote:
> On Thu, 2014-09-04 at 17:17 -0700, Paul E. McKenney wrote:
>> +And there are anti-guarantees:
>> +
>> + (*) These guarantees do not apply to bitfields, because compilers often
>> + generate code to modify these using non-atomic read-modify-write
>> + sequences.  Do not attempt to use bitfields to synchronize parallel
>> + algorithms.
>> +
>> + (*) Even in cases where bitfields are protected by locks, all fields
>> + in a given bitfield must be protected by one lock.  If two fields
>> + in a given bitfield are protected by different locks, the compiler's
>> + non-atomic read-modify-write sequences can cause an update to one
>> + field to corrupt the value of an adjacent field.
>> +
>> + (*) These guarantees apply only to properly aligned and sized scalar
>> + variables.  "Properly sized" currently means "int" and "long",
>> + because some CPU families do not support loads and stores of
>> + other sizes.  ("Some CPU families" is currently believed to
>> + be only Alpha 21064.  If this is actually the case, a different
>> + non-guarantee is likely to be formulated.)
> 
> This is a bit unclear.  Presumably you're talking about definiteness of
> the outcome (as in what's seen after multiple stores to the same
> variable).

No, the last condition refers to adjacent byte stores from different
cpu contexts (either interrupt or SMP).

> The guarantees are only for natural width on Parisc as well,
> so you would get a mess if you did byte stores to adjacent memory
> locations.

For a simple test like:

struct x {
long a;
char b;
char c;
char d;
char e;
};

void store_bc(struct x *p) {
p->b = 1;
p->c = 2;
}

on parisc, gcc generates separate byte stores

void store_bc(struct x *p) {
   0:   34 1c 00 02 ldi 1,ret0
   4:   0f 5c 12 08 stb ret0,4(r26)
   8:   34 1c 00 04 ldi 2,ret0
   c:   e8 40 c0 00 bv r0(rp)
  10:   0f 5c 12 0a stb ret0,5(r26)

which appears to confirm that on parisc adjacent byte data
is safe from corruption by concurrent cpu updates; that is,

CPU 0| CPU 1
 |
p->b = 1 | p->c = 2
 |

will result in p->b == 1 && p->c == 2 (assume both values
were 0 before the call to store_bc()).

Regards,
Peter Hurley



Re: [PATCH] IBM Akebono: Remove obsolete config select

2014-09-04 Thread Alistair Popple
On Fri, 5 Sep 2014 00:20:42 Paul Bolle wrote:



> > On Fri, 13 Jun 2014 13:56:32 Paul Bolle wrote:
> > > On Fri, 2014-05-02 at 18:06 +1000, Alistair Popple wrote:
> > > > The original implementation of MMC support for Akebono introduced a
> > > > new configuration symbol (MMC_SDHCI_OF_476GTR). This symbol has been
> > > > dropped in favour of using the generic platform driver however the
> > > > select for this symbol was mistakenly left in the platform
> > > > configuration.
> > > > 
> > > > This patch removes the obsolete symbol selection.
> > > > 
> > > > Signed-off-by: Alistair Popple 
> > > 
> > > This patch hasn't yet entered linux-next nor Linus' tree. Is it queued
> > > somewhere? If not, would a
> > > 
> > > Acked-by: Paul Bolle 
> > > 
> > > help to get this trivial patch queued for either of those trees?
> > > 
> > > 
> > > Paul Bolle
> > > 
> > > > ---
> > > > 
> > > >  arch/powerpc/platforms/44x/Kconfig | 1 -
> > > >  1 file changed, 1 deletion(-)
> > > > 
> > > > diff --git a/arch/powerpc/platforms/44x/Kconfig
> > > > b/arch/powerpc/platforms/44x/Kconfig index 8beec7d..908bf11 100644
> > > > --- a/arch/powerpc/platforms/44x/Kconfig
> > > > +++ b/arch/powerpc/platforms/44x/Kconfig
> > > > @@ -220,7 +220,6 @@ config AKEBONO
> > > > 
> > > > select USB_EHCI_HCD_PLATFORM
> > > > select MMC_SDHCI
> > > > select MMC_SDHCI_PLTFM
> > > > 
> > > > -   select MMC_SDHCI_OF_476GTR
> > > > 
> > > > select ATA
> > > > select SATA_AHCI_PLATFORM
> > > > help
> 
> This trivial cleanup is still not in linux-next nor in Linus' tree.
> Could someone else please have a look at it?
> 
> Thanks,
> 
> 
> Paul Bolle

Ben,

Any chance you could merge this? It's in patchwork (see 
http://patchwork.ozlabs.org/patch/344894/).

Regards,

Alistair


Re: bit fields && data tearing

2014-09-04 Thread James Bottomley
On Thu, 2014-09-04 at 17:17 -0700, Paul E. McKenney wrote:
> +And there are anti-guarantees:
> +
> + (*) These guarantees do not apply to bitfields, because compilers often
> + generate code to modify these using non-atomic read-modify-write
> + sequences.  Do not attempt to use bitfields to synchronize parallel
> + algorithms.
> +
> + (*) Even in cases where bitfields are protected by locks, all fields
> + in a given bitfield must be protected by one lock.  If two fields
> + in a given bitfield are protected by different locks, the compiler's
> + non-atomic read-modify-write sequences can cause an update to one
> + field to corrupt the value of an adjacent field.
> +
> + (*) These guarantees apply only to properly aligned and sized scalar
> + variables.  "Properly sized" currently means "int" and "long",
> + because some CPU families do not support loads and stores of
> + other sizes.  ("Some CPU families" is currently believed to
> + be only Alpha 21064.  If this is actually the case, a different
> + non-guarantee is likely to be formulated.)

This is a bit unclear.  Presumably you're talking about definiteness of
the outcome (as in what's seen after multiple stores to the same
variable).  The guarantees are only for natural width on Parisc as well,
so you would get a mess if you did byte stores to adjacent memory
locations.  But multiple 32 bit stores guarantees to see one of the
stored values as the final outcome.

James



Re: bit fields && data tearing

2014-09-04 Thread H. Peter Anvin
On 09/04/2014 05:59 PM, Peter Hurley wrote:
> I have no idea how prevalent the ev56 is compared to the ev5.
> Still we're talking about a chip that came out in 1996.

Ah yes, I stand corrected.  According to Wikipedia, the affected CPUs
were all the 2106x CPUs (EV4, EV45, LCA4, LCA45) plus the 21164 with no
suffix (EV5).  However, we're still talking about museum pieces here.

I wonder what the one I have in my garage is... I'm sure I could emulate
it faster, though.

-hpa



Re: bit fields && data tearing

2014-09-04 Thread Peter Hurley
[ +cc linux-alpha ]

Hi Paul,

On 09/04/2014 08:17 PM, Paul E. McKenney wrote:
> On Thu, Sep 04, 2014 at 03:16:03PM -0700, H. Peter Anvin wrote:
>> On 09/04/2014 12:42 PM, Peter Hurley wrote:
>>>
>>> Or we could give up on the Alpha.
>>>
>>
>> If Alpha is turning into Voyager (kept alive only as a museum piece, but
>> actively causing problems) then please let's kill it.
> 
> Sorry for being slow to join this thread, but I propose the following
patch.  If we can remove support for all CPUs that do not support
> direct access to bytes and shorts (which I would very much like to
> see happen), I will remove the last non-guarantee.
> 
>   Thanx, Paul

Although I don't mind the patch below, I don't think the bitfield thing
happened because anyone was confused about what the compiler would do;
here, it's more a case of legacy code that came out from under the
Big Kernel Lock and the bitfield was an oversight.

However, my patch to fix it by splitting the bitfield into 4 bytes
was rejected as insufficient to prevent accidental sharing. This is
what spun off the Alpha discussion about non-atomic byte updates.

FWIW, there are a bunch of problems with both the documentation and
kernel code if adjacent bytes can be overwritten by a single byte write.

Documentation/atomic_ops.txt claims that properly aligned chars are
atomic in the same sense that ints are, which is not true on the Alpha
(hopefully not a possible optimization on other arches -- I tried
with the ia64 cross compiler but it stuck with byte-sized writes).

Pretty much any large aggregate kernel structure is bound to have some
byte-size fields that are either lockless or serialized by different
locks, which may be corrupted by concurrent updates to adjacent data.
IOW, ACCESS_ONCE(), spinlocks, whatever, doesn't prevent adjacent
byte-sized data from being overwritten. I haven't bothered to count
how many global bools/chars there are and whether they might be
overwritten by adjacent updates.

Documentation/circular-buffers.txt and any lockless implementation based
on or similar to it for bytes or shorts will be corrupted if the head
nears the tail.

I'm sure there's other interesting outcomes that haven't come to light.

I think that 'naturally aligned scalar writes are atomic' should be the
minimum arch guarantee.
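
For concreteness, a userspace sketch of that hazard (all names are
illustrative). This is only unsafe on a machine whose compiler emits a
word-sized read-modify-write for byte stores, like the pre-EV56 Alpha
code shown elsewhere in this thread; on such a CPU one of the two
increments below can be silently lost:

#include <pthread.h>
#include <stdio.h>

struct byte_ring {
        unsigned char head;     /* written only by the producer */
        unsigned char tail;     /* written only by the consumer */
        unsigned char buf[256];
};

static struct byte_ring ring;

static void *producer(void *arg)
{
        /* If compiled as load-word/merge/store-word, this store can
         * overwrite a racing update of the adjacent 'tail' byte. */
        ring.head = (unsigned char)(ring.head + 1);
        return NULL;
}

static void *consumer(void *arg)
{
        ring.tail = (unsigned char)(ring.tail + 1);
        return NULL;
}

int main(void)
{
        pthread_t p, c;

        pthread_create(&p, NULL, producer, NULL);
        pthread_create(&c, NULL, consumer, NULL);
        pthread_join(p, NULL);
        pthread_join(c, NULL);

        /* With atomic byte stores both fields end up 1. */
        printf("head=%u tail=%u\n", ring.head, ring.tail);
        return 0;
}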

Regards,
Peter Hurley


> 
> 
> documentation: Record limitations of bitfields and small variables
> 
> This commit documents the fact that it is not safe to use bitfields
> as shared variables in synchronization algorithms.
> 
> Signed-off-by: Paul E. McKenney 
> 
> diff --git a/Documentation/memory-barriers.txt 
> b/Documentation/memory-barriers.txt
> index 87be0a8a78de..a28bfe4fd759 100644
> --- a/Documentation/memory-barriers.txt
> +++ b/Documentation/memory-barriers.txt
> @@ -269,6 +269,26 @@ And there are a number of things that _must_ or 
> _must_not_ be assumed:
>   STORE *(A + 4) = Y; STORE *A = X;
>   STORE {*A, *(A + 4) } = {X, Y};
>  
> +And there are anti-guarantees:
> +
> + (*) These guarantees do not apply to bitfields, because compilers often
> + generate code to modify these using non-atomic read-modify-write
> + sequences.  Do not attempt to use bitfields to synchronize parallel
> + algorithms.
> +
> + (*) Even in cases where bitfields are protected by locks, all fields
> + in a given bitfield must be protected by one lock.  If two fields
> + in a given bitfield are protected by different locks, the compiler's
> + non-atomic read-modify-write sequences can cause an update to one
> + field to corrupt the value of an adjacent field.
> +
> + (*) These guarantees apply only to properly aligned and sized scalar
> + variables.  "Properly sized" currently means "int" and "long",
> + because some CPU families do not support loads and stores of
> + other sizes.  ("Some CPU families" is currently believed to
> + be only Alpha 21064.  If this is actually the case, a different
> + non-guarantee is likely to be formulated.)
> +
>  
>  =
>  WHAT ARE MEMORY BARRIERS?
 


[PATCH] Make CONFIG_FHANDLE=y for all 64 bit powerpc defconfigs

2014-09-04 Thread Cyril Bur
CONFIG_FHANDLE is a requirement for systemd, and with the increasing
uptake of systemd within distros it makes sense for 64-bit defconfigs
to include it.

Signed-off-by: Cyril Bur 
---
 arch/powerpc/configs/cell_defconfig  | 1 +
 arch/powerpc/configs/celleb_defconfig| 1 +
 arch/powerpc/configs/corenet64_smp_defconfig | 1 +
 arch/powerpc/configs/g5_defconfig| 1 +
 arch/powerpc/configs/maple_defconfig | 1 +
 arch/powerpc/configs/pasemi_defconfig| 1 +
 arch/powerpc/configs/ppc64_defconfig | 1 +
 arch/powerpc/configs/ppc64e_defconfig| 1 +
 arch/powerpc/configs/ps3_defconfig   | 1 +
 arch/powerpc/configs/pseries_defconfig   | 1 +
 arch/powerpc/configs/pseries_le_defconfig| 1 +
 11 files changed, 11 insertions(+)

diff --git a/arch/powerpc/configs/cell_defconfig 
b/arch/powerpc/configs/cell_defconfig
index 4bee1a6..45fd06c 100644
--- a/arch/powerpc/configs/cell_defconfig
+++ b/arch/powerpc/configs/cell_defconfig
@@ -5,6 +5,7 @@ CONFIG_SMP=y
 CONFIG_NR_CPUS=4
 CONFIG_EXPERIMENTAL=y
 CONFIG_SYSVIPC=y
+CONFIG_FHANDLE=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_LOG_BUF_SHIFT=15
diff --git a/arch/powerpc/configs/celleb_defconfig 
b/arch/powerpc/configs/celleb_defconfig
index 6d7b22f..77d7bf3 100644
--- a/arch/powerpc/configs/celleb_defconfig
+++ b/arch/powerpc/configs/celleb_defconfig
@@ -5,6 +5,7 @@ CONFIG_SMP=y
 CONFIG_NR_CPUS=4
 CONFIG_EXPERIMENTAL=y
 CONFIG_SYSVIPC=y
+CONFIG_FHANDLE=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_LOG_BUF_SHIFT=15
diff --git a/arch/powerpc/configs/corenet64_smp_defconfig 
b/arch/powerpc/configs/corenet64_smp_defconfig
index 4b07bad..269d6e4 100644
--- a/arch/powerpc/configs/corenet64_smp_defconfig
+++ b/arch/powerpc/configs/corenet64_smp_defconfig
@@ -4,6 +4,7 @@ CONFIG_ALTIVEC=y
 CONFIG_SMP=y
 CONFIG_NR_CPUS=24
 CONFIG_SYSVIPC=y
+CONFIG_FHANDLE=y
 CONFIG_IRQ_DOMAIN_DEBUG=y
 CONFIG_NO_HZ=y
 CONFIG_HIGH_RES_TIMERS=y
diff --git a/arch/powerpc/configs/g5_defconfig 
b/arch/powerpc/configs/g5_defconfig
index 3c72fa6..7594c5a 100644
--- a/arch/powerpc/configs/g5_defconfig
+++ b/arch/powerpc/configs/g5_defconfig
@@ -5,6 +5,7 @@ CONFIG_NR_CPUS=4
 CONFIG_EXPERIMENTAL=y
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_FHANDLE=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_BLK_DEV_INITRD=y
diff --git a/arch/powerpc/configs/maple_defconfig 
b/arch/powerpc/configs/maple_defconfig
index 95e545d..c8b6a9d 100644
--- a/arch/powerpc/configs/maple_defconfig
+++ b/arch/powerpc/configs/maple_defconfig
@@ -4,6 +4,7 @@ CONFIG_NR_CPUS=4
 CONFIG_EXPERIMENTAL=y
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_FHANDLE=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 # CONFIG_COMPAT_BRK is not set
diff --git a/arch/powerpc/configs/pasemi_defconfig 
b/arch/powerpc/configs/pasemi_defconfig
index cec044a..e5e7838 100644
--- a/arch/powerpc/configs/pasemi_defconfig
+++ b/arch/powerpc/configs/pasemi_defconfig
@@ -3,6 +3,7 @@ CONFIG_ALTIVEC=y
 CONFIG_SMP=y
 CONFIG_NR_CPUS=2
 CONFIG_SYSVIPC=y
+CONFIG_FHANDLE=y
 CONFIG_NO_HZ=y
 CONFIG_HIGH_RES_TIMERS=y
 CONFIG_BLK_DEV_INITRD=y
diff --git a/arch/powerpc/configs/ppc64_defconfig 
b/arch/powerpc/configs/ppc64_defconfig
index f26b267..f6c02f8 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -4,6 +4,7 @@ CONFIG_VSX=y
 CONFIG_SMP=y
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_FHANDLE=y
 CONFIG_IRQ_DOMAIN_DEBUG=y
 CONFIG_NO_HZ=y
 CONFIG_HIGH_RES_TIMERS=y
diff --git a/arch/powerpc/configs/ppc64e_defconfig 
b/arch/powerpc/configs/ppc64e_defconfig
index 438e813..587f551 100644
--- a/arch/powerpc/configs/ppc64e_defconfig
+++ b/arch/powerpc/configs/ppc64e_defconfig
@@ -3,6 +3,7 @@ CONFIG_PPC_BOOK3E_64=y
 CONFIG_SMP=y
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_FHANDLE=y
 CONFIG_NO_HZ=y
 CONFIG_HIGH_RES_TIMERS=y
 CONFIG_TASKSTATS=y
diff --git a/arch/powerpc/configs/ps3_defconfig 
b/arch/powerpc/configs/ps3_defconfig
index fdee37f..2e637c8 100644
--- a/arch/powerpc/configs/ps3_defconfig
+++ b/arch/powerpc/configs/ps3_defconfig
@@ -5,6 +5,7 @@ CONFIG_SMP=y
 CONFIG_NR_CPUS=2
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_FHANDLE=y
 CONFIG_HIGH_RES_TIMERS=y
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_RD_LZMA=y
diff --git a/arch/powerpc/configs/pseries_defconfig 
b/arch/powerpc/configs/pseries_defconfig
index a905063..50375f1 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -5,6 +5,7 @@ CONFIG_SMP=y
 CONFIG_NR_CPUS=2048
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_FHANDLE=y
 CONFIG_AUDIT=y
 CONFIG_AUDITSYSCALL=y
 CONFIG_IRQ_DOMAIN_DEBUG=y
diff --git a/arch/powerpc/configs/pseries_le_defconfig 
b/arch/powerpc/configs/pseries_le_defconfig
index 58e3dbf..4428ee4 100644
--- a/arch/powerpc/configs/pseries_le_defconfig
+++ b/arch/powerpc/configs/pseries_le_defconfig
@@ -6,6 +6,7 @@ CONFIG_NR_CPUS=2048
 CONFIG_CPU_LITTLE_ENDIAN=y
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_FHANDLE=y
 CONFIG_AUDIT=y

Re: bit fields && data tearing

2014-09-04 Thread Peter Hurley
[ +cc linux-alpha ]

On 09/04/2014 06:14 PM, H. Peter Anvin wrote:
> On 09/04/2014 02:52 AM, Benjamin Herrenschmidt wrote:
>>
>> Yeah correct, alpha and bytes right ? Is there any other ? That's why I
>> suggested int.
>>
> 
> Even for Alpha it is only the 21064 AFAIK.

For -mcpu=ev5 (21164) and the following test

struct x {
long a;
char b;
char c;
char d;
char e;
};

void store_b(struct x *p) {
p->b = 1;
}

gcc generates:

void store_b(struct x *p) {
   0:   08 00 30 a0 ldl t0,8(a0)
   4:   01 f1 3f 44 andnot  t0,0xff,t0
   8:   01 34 20 44 or  t0,0x1,t0
   c:   08 00 30 b0 stl t0,8(a0)
  10:   01 80 fa 6b ret

IOW, rmw on 3 adjacent bytes, which is bad :)
For -mcpu=ev56 (21164A), the generated code is:

void store_b(struct x *p) {
   0:   01 00 3f 20 lda t0,1
   4:   08 00 30 38 stb t0,8(a0)
   8:   01 80 fa 6b ret

which is ok.
I have no idea how prevalent the ev56 is compared to the ev5.
Still we're talking about a chip that came out in 1996.

I still hate split caches though.

Regards,
Peter Hurley


Re: [PATCH 2/2] sched: BUG when stack end location is over written

2014-09-04 Thread Aaron Tomlin
On Thu, Sep 04, 2014 at 05:32:31PM +0200, Peter Zijlstra wrote:
> On Thu, Sep 04, 2014 at 03:50:24PM +0100, Aaron Tomlin wrote:
> > Currently in the event of a stack overrun a call to schedule()
> > does not check for this type of corruption. This corruption is
> > often silent and can go unnoticed. However, once the corrupted
> > region is examined at a later stage, the outcome is undefined
> > and often results in a sporadic page fault which cannot be
> > handled.
> > 
> > This patch checks for a stack overrun and takes appropriate
> > action; since the damage is already done, there is no point
> > in continuing.
> > 
> > Signed-off-by: Aaron Tomlin 
> > ---
> >  kernel/sched/core.c | 3 +++
> >  1 file changed, 3 insertions(+)
> > 
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index ec1a286..d6af6a0 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -2660,6 +2660,9 @@ static noinline void __schedule_bug(struct 
> > task_struct *prev)
> >   */
> >  static inline void schedule_debug(struct task_struct *prev)
> >  {
> > +   if (unlikely(prev != &init_task &&
> > +   task_stack_end_corrupted(prev)))
> > +   BUG();
> 
> superfluous linebreak, also we appear to have BUG_ON() for situations
> just like these.
> 
> secondly, while I appreciate the 'feature' you're making schedule()
> slower for everybody, what do you propose to do about that?

Understood. I will wrap this with a suitable Kconfig option.
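
A minimal sketch of what that could look like, assuming a hypothetical
CONFIG_SCHED_STACK_END_CHECK symbol (the name is an assumption, not part
of this series) and the BUG_ON() form suggested above:

/* Sketch only: the Kconfig symbol name and placement are assumptions. */
static inline void schedule_debug(struct task_struct *prev)
{
#ifdef CONFIG_SCHED_STACK_END_CHECK
        BUG_ON(prev != &init_task && task_stack_end_corrupted(prev));
#endif
        /* ... existing atomic-context checks follow ... */
}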

-- 
Aaron Tomlin

Re: [PATCH 1/2] sched: Add helper for task stack page overrun checking

2014-09-04 Thread Aaron Tomlin
On Thu, Sep 04, 2014 at 05:02:34PM +0200, Oleg Nesterov wrote:
> On 09/04, Aaron Tomlin wrote:
> >
> > +#define task_stack_end_corrupted(task) \
> > +   (*(end_of_stack(task)) != STACK_END_MAGIC)
> 
> and it is always used along with "tsk != init_task".
> 
> But why we exclude swapper/0? Can't we add
> 
>   *end_of_stack(current) = STACK_END_MAGIC;
> 
> somewhere at the start of start_kernel() ?

Good point. I can look into it.
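
A rough sketch of that idea (the helper name is hypothetical): write the
canary for swapper/0 once during early boot, after which callers would
no longer need the "!= &init_task" special case:

/* Hypothetical helper, called near the top of start_kernel() while
 * current is still the init task (swapper/0). */
static void __init set_init_stack_end_magic(void)
{
        unsigned long *stackend = end_of_stack(current);

        *stackend = STACK_END_MAGIC;    /* enable overrun detection for swapper */
}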

> If not, perhaps this new helper should check "task != &init_task"
> itself so that we can simplify its users?
> 
> Oleg.
> 
> >  
> >  static inline int object_is_on_stack(void *obj)
> >  {
> > diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
> > index 8a4e5cb..06c7390 100644
> > --- a/kernel/trace/trace_stack.c
> > +++ b/kernel/trace/trace_stack.c
> > @@ -13,7 +13,6 @@
> >  #include <linux/sysctl.h>
> >  #include <linux/init.h>
> >  #include <linux/fs.h>
> > -#include <linux/magic.h>
> >  
> >  #include <asm/setup.h>
> >  
> > @@ -171,8 +170,8 @@ check_stack(unsigned long ip, unsigned long *stack)
> > i++;
> > }
> >  
> > -   if ((current != &init_task &&
> > -   *(end_of_stack(current)) != STACK_END_MAGIC)) {
> > +   if (current != &init_task &&
> > +   task_stack_end_corrupted(current)) {
> > print_max_stack();
> > BUG();
> > }
> > -- 
> > 1.9.3
> > 
> 

-- 
Aaron Tomlin

[PATCH 2/2] sched: BUG when stack end location is over written

2014-09-04 Thread Aaron Tomlin
Currently in the event of a stack overrun a call to schedule()
does not check for this type of corruption. This corruption is
often silent and can go unnoticed. However, once the corrupted
region is examined at a later stage, the outcome is undefined
and often results in a sporadic page fault which cannot be
handled.

This patch checks for a stack overrun and takes appropriate
action; since the damage is already done, there is no point
in continuing.

Signed-off-by: Aaron Tomlin 
---
 kernel/sched/core.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ec1a286..d6af6a0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2660,6 +2660,9 @@ static noinline void __schedule_bug(struct task_struct 
*prev)
  */
 static inline void schedule_debug(struct task_struct *prev)
 {
+   if (unlikely(prev != &init_task &&
+   task_stack_end_corrupted(prev)))
+   BUG();
/*
 * Test if we are atomic. Since do_exit() needs to call into
 * schedule() atomically, we ignore that path. Otherwise whine
-- 
1.9.3


[PATCH 0/2] sched: Always check the integrity of the canary

2014-09-04 Thread Aaron Tomlin
Currently in the event of a stack overrun a call to schedule()
does not check for this type of corruption. This corruption is
often silent and can go unnoticed. However, once the corrupted
region is examined at a later stage, the outcome is undefined
and often results in a sporadic page fault which cannot be
handled.

The first patch provides a helper to determine the integrity
of the canary, while the second patch checks for a stack
overrun and takes appropriate action; since the damage is
already done, there is no point in continuing.

Aaron Tomlin (2):
  sched: Add helper for task stack page overrun checking
  sched: BUG when stack end location is over written

 arch/powerpc/mm/fault.c| 6 ++
 arch/x86/mm/fault.c| 5 +
 include/linux/sched.h  | 3 +++
 kernel/sched/core.c| 3 +++
 kernel/trace/trace_stack.c | 5 ++---
 5 files changed, 11 insertions(+), 11 deletions(-)

-- 
1.9.3


[PATCH 1/2] sched: Add helper for task stack page overrun checking

2014-09-04 Thread Aaron Tomlin
This facility is used in a few places, so let's introduce
a helper function to improve readability.

Signed-off-by: Aaron Tomlin 
---
 arch/powerpc/mm/fault.c| 6 ++
 arch/x86/mm/fault.c| 5 +
 include/linux/sched.h  | 3 +++
 kernel/trace/trace_stack.c | 5 ++---
 4 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 51ab9e7..5cffc7c 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -30,7 +30,6 @@
 #include <linux/kprobes.h>
 #include <linux/kdebug.h>
 #include <linux/perf_event.h>
-#include <linux/magic.h>
 #include <linux/ratelimit.h>
 #include <linux/context_tracking.h>
 
@@ -508,7 +507,6 @@ bail:
 void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
 {
const struct exception_table_entry *entry;
-   unsigned long *stackend;
 
/* Are we prepared to handle this fault?  */
if ((entry = search_exception_tables(regs->nip)) != NULL) {
@@ -537,8 +535,8 @@ void bad_page_fault(struct pt_regs *regs, unsigned long 
address, int sig)
printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n",
regs->nip);
 
-   stackend = end_of_stack(current);
-   if (current != &init_task && *stackend != STACK_END_MAGIC)
+   if (current != &init_task &&
+   task_stack_end_corrupted(current))
printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
 
die("Kernel access of bad area", regs, sig);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a241946..b5b9c09 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -3,7 +3,6 @@
  *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
  *  Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
  */
-#include <linux/magic.h>		/* STACK_END_MAGIC		*/
 #include <linux/sched.h>		/* test_thread_flag(), ...	*/
 #include <linux/kdebug.h>		/* oops_begin/end, ...		*/
 #include <linux/module.h>		/* search_exception_table	*/
@@ -649,7 +648,6 @@ no_context(struct pt_regs *regs, unsigned long error_code,
   unsigned long address, int signal, int si_code)
 {
struct task_struct *tsk = current;
-   unsigned long *stackend;
unsigned long flags;
int sig;
 
@@ -709,8 +707,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 
show_fault_oops(regs, error_code, address);
 
-   stackend = end_of_stack(tsk);
-   if (tsk != &init_task && *stackend != STACK_END_MAGIC)
+   if (tsk != &init_task && task_stack_end_corrupted(tsk))
printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
 
tsk->thread.cr2 = address;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5c2c885..36d93aa 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -57,6 +57,7 @@ struct sched_param {
 #include <linux/llist.h>
 #include <linux/uidgid.h>
 #include <linux/gfp.h>
+#include <linux/magic.h>
 
 #include <asm/processor.h>
 
@@ -2614,6 +2615,8 @@ static inline unsigned long *end_of_stack(struct 
task_struct *p)
 }
 
 #endif
+#define task_stack_end_corrupted(task) \
+   (*(end_of_stack(task)) != STACK_END_MAGIC)
 
 static inline int object_is_on_stack(void *obj)
 {
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 8a4e5cb..06c7390 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -13,7 +13,6 @@
 #include <linux/sysctl.h>
 #include <linux/init.h>
 #include <linux/fs.h>
-#include <linux/magic.h>
 
 #include <asm/setup.h>
 
@@ -171,8 +170,8 @@ check_stack(unsigned long ip, unsigned long *stack)
i++;
}
 
-   if ((current != &init_task &&
-   *(end_of_stack(current)) != STACK_END_MAGIC)) {
+   if (current != &init_task &&
+   task_stack_end_corrupted(current)) {
print_max_stack();
BUG();
}
-- 
1.9.3


Re: bit fields && data tearing

2014-09-04 Thread Paul E. McKenney
On Thu, Sep 04, 2014 at 03:16:03PM -0700, H. Peter Anvin wrote:
> On 09/04/2014 12:42 PM, Peter Hurley wrote:
> > 
> > Or we could give up on the Alpha.
> > 
> 
> If Alpha is turning into Voyager (kept alive only as a museum piece, but
> actively causing problems) then please let's kill it.

Sorry for being slow to join this thread, but I propose the following
patch.  If we can remove support for all CPUs that do not support
direct access to bytes and shorts (which I would very much like to
see happen), I will remove the last non-guarantee.

Thanx, Paul



documentation: Record limitations of bitfields and small variables

This commit documents the fact that it is not safe to use bitfields
as shared variables in synchronization algorithms.

Signed-off-by: Paul E. McKenney 

diff --git a/Documentation/memory-barriers.txt 
b/Documentation/memory-barriers.txt
index 87be0a8a78de..a28bfe4fd759 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -269,6 +269,26 @@ And there are a number of things that _must_ or _must_not_ 
be assumed:
STORE *(A + 4) = Y; STORE *A = X;
STORE {*A, *(A + 4) } = {X, Y};
 
+And there are anti-guarantees:
+
+ (*) These guarantees do not apply to bitfields, because compilers often
+ generate code to modify these using non-atomic read-modify-write
+ sequences.  Do not attempt to use bitfields to synchronize parallel
+ algorithms.
+
+ (*) Even in cases where bitfields are protected by locks, all fields
+ in a given bitfield must be protected by one lock.  If two fields
+ in a given bitfield are protected by different locks, the compiler's
+ non-atomic read-modify-write sequences can cause an update to one
+ field to corrupt the value of an adjacent field.
+
+ (*) These guarantees apply only to properly aligned and sized scalar
+ variables.  "Properly sized" currently means "int" and "long",
+ because some CPU families do not support loads and stores of
+ other sizes.  ("Some CPU families" is currently believed to
+ be only Alpha 21064.  If this is actually the case, a different
+ non-guarantee is likely to be formulated.)
+
 
 =
 WHAT ARE MEMORY BARRIERS?
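
To make the second anti-guarantee concrete, a small illustration (the
names are invented): two one-bit fields that share a storage unit cannot
safely be guarded by two different locks, because each store is a
read-modify-write of the whole unit:

struct shared_flags {
        int a:1;        /* guarded by lock_a */
        int b:1;        /* guarding this with a separate lock_b is broken:
                         * updating 'b' loads and rewrites the word that
                         * also holds 'a', so it can undo a concurrent,
                         * correctly locked update of 'a'. */
};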


Re: [PATCH v2] QE: move qe code from arch/powerpc to drivers/soc

2014-09-04 Thread Scott Wood
On Thu, 2014-09-04 at 13:06 +0800, Zhao Qiang wrote:
> LS1 is an ARM CPU and it has the QE IP block.
> Move the QE code from the platform directory to a common directory.
> 
> QE is an IP block that integrates several communications peripheral
> controllers. It can implement a variety of applications, such as
> UART, USB and TDM.
> 
> Signed-off-by: Zhao Qiang 
> ---
> Changes for v2:
>   - mv code to drivers/soc

Who will be the maintainer of this code once it lives in drivers/soc,
especially once it is no longer used only by PPC?

>  44 files changed, 113 insertions(+), 113 deletions(-)
>  delete mode 100644 arch/powerpc/sysdev/qe_lib/Kconfig
>  create mode 100644 drivers/soc/qe/Kconfig
>  rename {arch/powerpc/sysdev/qe_lib => drivers/soc/qe}/Makefile (100%)
>  rename {arch/powerpc/sysdev/qe_lib => drivers/soc/qe}/gpio.c (99%)
>  rename {arch/powerpc/sysdev/qe_lib => drivers/soc/qe}/qe.c (99%)
>  rename {arch/powerpc/sysdev/qe_lib => drivers/soc/qe}/qe_ic.c (99%)
>  rename {arch/powerpc/sysdev/qe_lib => drivers/soc/qe}/qe_ic.h (98%)
>  rename {arch/powerpc/sysdev/qe_lib => drivers/soc/qe}/qe_io.c (99%)
>  rename {arch/powerpc/sysdev/qe_lib => drivers/soc/qe}/ucc.c (98%)
>  rename {arch/powerpc/sysdev/qe_lib => drivers/soc/qe}/ucc_fast.c (98%)
>  rename {arch/powerpc/sysdev/qe_lib => drivers/soc/qe}/ucc_slow.c (98%)
>  rename {arch/powerpc/sysdev/qe_lib => drivers/soc/qe}/usb.c (96%)

drivers/soc/fsl-qe

> diff --git a/drivers/soc/Makefile b/drivers/soc/Makefile
> index 0f7c447..5da1a482 100644
> --- a/drivers/soc/Makefile
> +++ b/drivers/soc/Makefile
> @@ -3,3 +3,5 @@
>  #
>  
>  obj-$(CONFIG_ARCH_QCOM)  += qcom/
> +
> +obj-$(CONFIG_QUICC_ENGINE) += qe/

Please keep the file consistent regarding tabs versus spaces.

Plus, why do you need a newline between them?

> diff --git a/drivers/soc/qe/Kconfig b/drivers/soc/qe/Kconfig
> new file mode 100644
> index 000..8b03ca2
> --- /dev/null
> +++ b/drivers/soc/qe/Kconfig
> @@ -0,0 +1,45 @@
> +#
> +# QE Communication options
> +#
> +config QUICC_ENGINE
> + bool "Freescale QUICC Engine (QE) Support"
> + depends on FSL_SOC && PPC32
> + select PPC_LIB_RHEAP
> + select CRC32
> + help
> +   The QUICC Engine (QE) is a new generation of communications
> +   coprocessors on Freescale embedded CPUs (akin to CPM in older chips).
> +   Selecting this option means that you wish to build a kernel
> +   for a machine with a QE coprocessor.
> +
> +config QE_GPIO
> + bool "QE GPIO support"
> + depends on QUICC_ENGINE
> + select ARCH_REQUIRE_GPIOLIB
> + help
> +   Say Y here if you're going to use hardware that connects to the
> +   QE GPIOs.
> +
> +config UCC_SLOW
> + bool
> + default y if SERIAL_QE
> + help
> +   This option provides qe_lib support to UCC slow
> +   protocols: UART, BISYNC, QMC
> +
> +config UCC_FAST
> + bool
> + default y if UCC_GETH
> + help
> +   This option provides qe_lib support to UCC fast
> +   protocols: HDLC, Ethernet, ATM, transparent
> +
> +config UCC
> + bool
> + default y if UCC_FAST || UCC_SLOW
> +
> +config QE_USB
> + bool
> + default y if USB_FSL_QE
> + help
> +   QE USB Controller support

First could we give these names better namespacing?

-Scott



Re: bit fields && data tearing

2014-09-04 Thread H. Peter Anvin
On 09/04/2014 12:42 PM, Peter Hurley wrote:
> 
> Or we could give up on the Alpha.
> 

If Alpha is turning into Voyager (kept alive only as a museum piece, but
actively causing problems) then please let's kill it.

-hpa



Re: bit fields && data tearing

2014-09-04 Thread H. Peter Anvin
On 09/04/2014 02:52 AM, Benjamin Herrenschmidt wrote:
> 
> Yeah correct, alpha and bytes right ? Is there any other ? That's why I
> suggested int.
> 

Even for Alpha it is only the 21064 AFAIK.

-hpa



Re: [4/5] IBM Akebono: Add the Akebono platform

2014-09-04 Thread Paul Bolle
Hi Alistair,

On Fri, 2014-05-02 at 10:35 +1000, Alistair Popple wrote:
> On Thu, 1 May 2014 11:27:27 Paul Bolle wrote:
> > On Thu, 2014-03-06 at 14:52 +1100, Alistair Popple wrote:
> > > This patch adds support for the IBM Akebono board.
> > > + select IBM_EMAC_RGMII_WOL
> > 
> > The patch that added this symbol (and the related driver) was submitted
> > in https://lkml.org/lkml/2014/2/21/25 . It's not (yet) included in
> > linux-next. Is it queued somewhere else?
> 
> To be honest I'm not sure. I will follow this up on the netdev list.

I noticed that this symbol (and its driver) are still not in v3.17-rc3
nor in next-20140904. Any update on this?

Thanks,


Paul Bolle


Re: [PATCH] IBM Akebono: Remove obsolete config select

2014-09-04 Thread Paul Bolle
On Fri, 2014-06-20 at 10:10 +1000, Alistair Popple wrote:
> Hi Ben,
> 
> It looks like we may have missed this trivial fix? Can you please apply it to 
> your tree?
> 
> Regards,
> 
> Alistair
> 
> On Fri, 13 Jun 2014 13:56:32 Paul Bolle wrote:
> > On Fri, 2014-05-02 at 18:06 +1000, Alistair Popple wrote:
> > > The original implementation of MMC support for Akebono introduced a
> > > new configuration symbol (MMC_SDHCI_OF_476GTR). This symbol has been
> > > dropped in favour of using the generic platform driver however the
> > > select for this symbol was mistakenly left in the platform
> > > configuration.
> > > 
> > > This patch removes the obsolete symbol selection.
> > > 
> > > Signed-off-by: Alistair Popple 
> > 
> > This patch hasn't yet entered linux-next nor Linus' tree. Is it queued
> > somewhere? If not, would a
> > Acked-by: Paul Bolle 
> > 
> > help to get this trivial patch queued for either of those trees?
> > 
> > 
> > Paul Bolle
> > 
> > > ---
> > > 
> > >  arch/powerpc/platforms/44x/Kconfig | 1 -
> > >  1 file changed, 1 deletion(-)
> > > 
> > > diff --git a/arch/powerpc/platforms/44x/Kconfig
> > > b/arch/powerpc/platforms/44x/Kconfig index 8beec7d..908bf11 100644
> > > --- a/arch/powerpc/platforms/44x/Kconfig
> > > +++ b/arch/powerpc/platforms/44x/Kconfig
> > > @@ -220,7 +220,6 @@ config AKEBONO
> > > 
> > >   select USB_EHCI_HCD_PLATFORM
> > >   select MMC_SDHCI
> > >   select MMC_SDHCI_PLTFM
> > > 
> > > - select MMC_SDHCI_OF_476GTR
> > > 
> > >   select ATA
> > >   select SATA_AHCI_PLATFORM
> > >   help

This trivial cleanup is still not in linux-next nor in Linus' tree.
Could someone else please have a look at it?

Thanks,


Paul Bolle


Re: [PATCH v2] iommu/fsl: Fix warning resulting from adding PCI device twice

2014-09-04 Thread Emil Medve
On 09/04/2014 06:38 AM, Varun Sethi wrote:
> iommu_group_get_for_dev determines the iommu group for the PCI device
> and adds the device to the group.
> 
> In the PAMU driver we were again adding the device to the same group
> without checking if the device already had an iommu group. This
> resulted in the following warning.
> 
> sysfs: cannot create duplicate filename '/devices/ffe20.pcie/pci:00/:00:00.0/iommu_group'
> [ cut here ]
> WARNING: at fs/sysfs/dir.c:31
> Modules linked in:
> CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.17.0-rc3-2-g7505cea-dirty #126
> task: c001fe0a ti: c001fe044000 task.ti: c001fe044000
> NIP: c018879c LR: c0188798 CTR: c001ea50
> REGS: c001fe047040 TRAP: 0700   Not tainted  (3.17.0-rc3-2-g7505cea-dirty)
> MSR: 80029000   CR: 24ad8e22  XER: 2000
> SOFTE: 1 
> GPR00: c0188798 c001fe0472c0 c09a52e0 0065 
> GPR04: 0001  3a30303a 2700 
> GPR08: 2f696f6d c08d3830 c09b3938 c09bb3d0 
> GPR12: 28ad8e24 cfff4000 c000205c  
> GPR16:     
> GPR20:    c08a4c70 
> GPR24: c07e9010 c001fe0140a8 ffef 0001 
> GPR28: c001fe22ebb8 c07e9010 c090bf10 c001fe22 
> NIP [c018879c] .sysfs_warn_dup+0x74/0xa4
> LR [c0188798] .sysfs_warn_dup+0x70/0xa4
> Call Trace:
> [c001fe0472c0] [c0188798] .sysfs_warn_dup+0x70/0xa4 (unreliable)
> [c001fe047350] [c0188d34] .sysfs_do_create_link_sd.clone.2+0x168/0x174
> [c001fe047400] [c04b3cf8] .iommu_group_add_device+0x78/0x244
> [c001fe0474b0] [c04b6964] .fsl_pamu_add_device+0x88/0x1a8
> [c001fe047570] [c04b3960] .iommu_bus_notifier+0xdc/0x15c
> [c001fe047600] [c0059848] .notifier_call_chain+0x8c/0xe8
> [c001fe0476a0] [c0059d04] .__blocking_notifier_call_chain+0x58/0x84
> [c001fe047750] [c036619c] .device_add+0x464/0x5c8
> [c001fe047820] [c0300ebc] .pci_device_add+0x14c/0x17c
> [c001fe0478c0] [c0300fbc] .pci_scan_single_device+0xd0/0xf4
> [c001fe047970] [c030104c] .pci_scan_slot+0x6c/0x18c
> [c001fe047a10] [c030226c] .pci_scan_child_bus+0x40/0x114
> [c001fe047ac0] [c0021974] .pcibios_scan_phb+0x240/0x2c8
> [c001fe047b70] [c085a970] .pcibios_init+0x64/0xc8
> [c001fe047c00] [c0001884] .do_one_initcall+0xbc/0x224
> [c001fe047d00] [c0852d50] .kernel_init_freeable+0x14c/0x21c
> [c001fe047db0] [c0002078] .kernel_init+0x1c/0xfa4
> [c001fe047e30] [c884] .ret_from_kernel_thread+0x58/0xd4
> Instruction dump:
> 7c7f1b79 4182001c 7fe4fb78 7f83e378 38a01000 4bffc905 6000 7c641b78 
> e87e8008 7fa5eb78 48482ff5 6000 <0fe0> 7fe3fb78 4bf7bd39 6000 
> 
> 
> 
> Signed-off-by: Varun Sethi 

Tested-by: Emil Medve 

Re: bit fields && data tearing

2014-09-04 Thread Peter Hurley
On 09/04/2014 12:50 PM, One Thousand Gnomes wrote:
>> Besides updating the documentation, it may make sense to do something
>> arch-specific. Just bumping out storage on arches that don't need it
>> seems wasteful, as does generating bus locks on arches that don't need it.
>> Unfortunately, the code churn looks unavoidable.
> 
> The arch specific is pretty much set_bit and friends. Bus locks on a
> locally owned cache line should not be very expensive on anything vaguely
> modern, while uniprocessor boxes usually only have to generate set_bit
> as a single instruction so it is interrupt safe.

Or we could give up on the Alpha.

It's not just the non-atomic bytes; we could do away with the
read_barrier_depends() which hardly any code gets correctly anyway.

Regards,
Peter Hurley


Re: bit fields && data tearing

2014-09-04 Thread One Thousand Gnomes
> Besides updating the documentation, it may make sense to do something
> arch-specific. Just bumping out storage on arches that don't need it
> seems wasteful, as does generating bus locks on arches that don't need it.
> Unfortunately, the code churn looks unavoidable.

The arch specific is pretty much set_bit and friends. Bus locks on a
locally owned cache line should not be very expensive on anything vaguely
modern, while uniprocessor boxes usually only have to generate set_bit
as a single instruction so it is interrupt safe.
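
A sketch of that approach (bit numbers and names are invented for
illustration; this is not the tty patch itself):

#include <linux/bitops.h>
#include <linux/types.h>

#define DEMO_STOPPED	0
#define DEMO_HW_STOPPED	1

static unsigned long demo_flags;

static void demo_stop(void)
{
	/* Atomic read-modify-write of a single bit: a racing update to
	 * DEMO_HW_STOPPED in the same word cannot be lost. */
	set_bit(DEMO_STOPPED, &demo_flags);
}

static bool demo_stopped(void)
{
	return test_bit(DEMO_STOPPED, &demo_flags);
}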

Alan

Re: [PATCH 2/2] sched: BUG when stack end location is over written

2014-09-04 Thread Peter Zijlstra
On Thu, Sep 04, 2014 at 03:50:24PM +0100, Aaron Tomlin wrote:
> Currently in the event of a stack overrun a call to schedule()
> does not check for this type of corruption. This corruption is
> often silent and can go unnoticed. However, once the corrupted
> region is examined at a later stage, the outcome is undefined
> and often results in a sporadic page fault which cannot be
> handled.
> 
> This patch checks for a stack overrun and takes appropriate
> action; since the damage is already done, there is no point
> in continuing.
> 
> Signed-off-by: Aaron Tomlin 
> ---
>  kernel/sched/core.c | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index ec1a286..d6af6a0 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2660,6 +2660,9 @@ static noinline void __schedule_bug(struct task_struct 
> *prev)
>   */
>  static inline void schedule_debug(struct task_struct *prev)
>  {
> + if (unlikely(prev != &init_task &&
> + task_stack_end_corrupted(prev)))
> + BUG();

superfluous linebreak, also we appear to have BUG_ON() for situations
just like these.

secondly, while I appreciate the 'feature' you're making schedule()
slower for everybody, what do you propose to do about that?

Re: [PATCH 1/2] sched: Add helper for task stack page overrun checking

2014-09-04 Thread Peter Zijlstra
On Thu, Sep 04, 2014 at 03:50:23PM +0100, Aaron Tomlin wrote:

> @@ -537,8 +535,8 @@ void bad_page_fault(struct pt_regs *regs, unsigned long 
> address, int sig)
>   printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n",
>   regs->nip);
>  
> - stackend = end_of_stack(current);
> - if (current != &init_task && *stackend != STACK_END_MAGIC)
> + if (current != &init_task &&
> + task_stack_end_corrupted(current))

superfluous linebreak.

> @@ -2614,6 +2615,8 @@ static inline unsigned long *end_of_stack(struct 
> task_struct *p)
>  }
>  
>  #endif

Here otoh we could do with some extra whitespace

> @@ -171,8 +170,8 @@ check_stack(unsigned long ip, unsigned long *stack)
>   i++;
>   }
>  
> - if ((current != &init_task &&
> - *(end_of_stack(current)) != STACK_END_MAGIC)) {
> + if (current != &init_task &&
> + task_stack_end_corrupted(current)) {

Again, superfluous linebreak.

Re: [PATCH 1/2] sched: Add helper for task stack page overrun checking

2014-09-04 Thread Oleg Nesterov
On 09/04, Aaron Tomlin wrote:
>
> +#define task_stack_end_corrupted(task) \
> + (*(end_of_stack(task)) != STACK_END_MAGIC)

and it is always used along with "tsk != init_task".

But why do we exclude swapper/0? Can't we add

	*end_of_stack(current) = STACK_END_MAGIC;

somewhere at the start of start_kernel() ?

If not, perhaps this new helper should check "task != &init_task"
itself so that we can simplify its users?

Oleg.

>  
>  static inline int object_is_on_stack(void *obj)
>  {
> diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
> index 8a4e5cb..06c7390 100644
> --- a/kernel/trace/trace_stack.c
> +++ b/kernel/trace/trace_stack.c
> @@ -13,7 +13,6 @@
>  #include <linux/sysctl.h>
>  #include <linux/init.h>
>  #include <linux/fs.h>
> -#include <linux/magic.h>
>  
>  #include <asm/setup.h>
>  
> @@ -171,8 +170,8 @@ check_stack(unsigned long ip, unsigned long *stack)
>   i++;
>   }
>  
> - if ((current != &init_task &&
> - *(end_of_stack(current)) != STACK_END_MAGIC)) {
> + if (current != &init_task &&
> + task_stack_end_corrupted(current)) {
>   print_max_stack();
>   BUG();
>   }
> -- 
> 1.9.3
> 


Re: TTM placement & caching issue/questions

2014-09-04 Thread Thomas Hellstrom
On 09/04/2014 11:43 AM, Benjamin Herrenschmidt wrote:
> On Thu, 2014-09-04 at 11:34 +0200, Daniel Vetter wrote:
>> On Thu, Sep 04, 2014 at 09:44:04AM +0200, Thomas Hellstrom wrote:
>>> Last time I tested, (and it seems like Michel is on the same track),
>>> writing with the CPU to write-combined memory was substantially faster
>>> than writing to cached memory, with the additional side-effect that CPU
>>> caches are left unpolluted.
>>>
>>> Moreover (although only tested on Intel's embedded chipsets), texturing
>>> from cpu-cache-coherent PCI memory was a real GPU performance hog
>>> compared to texturing from non-snooped memory. Hence, whenever a buffer
>>> could be classified as GPU-read-only (or almost at least), it should be
>>> placed in write-combined memory.
>> Just a quick comment since this explicitly referes to intel chips: On
>> desktop/laptop chips with the big shared l3/l4 caches it's the other way
>> round. Cached uploads are substantially faster than wc and not using
>> coherent access is a severe perf hit for texturing. I guess the hw guys
>> worked really hard to hide the snooping costs so that the gpu can benefit
>> from the massive bandwidth these caches can provide.
> This is similar to modern POWER chips as well. We have pretty big L3's
> (though not technically shared they are in a separate quadrant and we
> have a shared L4 in the memory buffer) and our fabric is generally
> optimized for cachable/coherent access performance. In fact, we only
> have so many credits for NC accesses on the bus...
>

Thanks both of you for the update. I haven't dealt with real hardware
for a while...

/Thomas


Re: TTM placement & caching issue/questions

2014-09-04 Thread Daniel Vetter
On Thu, Sep 04, 2014 at 09:44:04AM +0200, Thomas Hellstrom wrote:
> Last time I tested, (and it seems like Michel is on the same track),
> writing with the CPU to write-combined memory was substantially faster
> than writing to cached memory, with the additional side-effect that CPU
> caches are left unpolluted.
> 
> Moreover (although only tested on Intel's embedded chipsets), texturing
> from cpu-cache-coherent PCI memory was a real GPU performance hog
> compared to texturing from non-snooped memory. Hence, whenever a buffer
> could be classified as GPU-read-only (or almost at least), it should be
> placed in write-combined memory.

Just a quick comment since this explicitly referes to intel chips: On
desktop/laptop chips with the big shared l3/l4 caches it's the other way
round. Cached uploads are substantially faster than wc and not using
coherent access is a severe perf hit for texturing. I guess the hw guys
worked really hard to hide the snooping costs so that the gpu can benefit
from the massive bandwidth these caches can provide.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

Re: bit fields && data tearing

2014-09-04 Thread Mikael Pettersson
Benjamin Herrenschmidt writes:
 > On Wed, 2014-09-03 at 18:51 -0400, Peter Hurley wrote:
 > 
 > > Apologies for hijacking this thread but I need to extend this discussion
 > > somewhat regarding what a compiler might do with adjacent fields in a 
 > > structure.
 > > 
 > > The tty subsystem defines a large aggregate structure, struct tty_struct.
 > > Importantly, several different locks apply to different fields within that
 > > structure; ie., a specific spinlock will be claimed before updating or 
 > > accessing
 > > certain fields while a different spinlock will be claimed before updating 
 > > or
 > > accessing certain _adjacent_ fields.
 > > 
 > > What is necessary and sufficient to prevent accidental false-sharing?
 > > The patch below was flagged as insufficient on ia64, and possibly ARM.
 > 
 > We expect native aligned scalar types to be accessed atomically (the
 > read/modify/write of a larger quantity that gcc does on some bitfield
 > cases has been flagged as a gcc bug, but shouldn't happen on normal
 > scalar types).
 > 
 > I am not 100% certain of "bool" here, I assume it's treated as a normal
 > scalar and thus atomic but if unsure, you can always use int.

Please use an aligned int or long.  Some machines cannot do atomic
accesses on sub-int/long quantities, so 'bool' may cause unexpected
rmw cycles on adjacent fields.

/Mikael

 > 
 > Another option is to use the atomic bitops and make these bits in a
 > bitmask but that is probably unnecessary if you have locks already.
 > 
 > Cheers,
 > Ben.
 > 
 > 
 > > Regards,
 > > Peter Hurley
 > > 
 > > --- >% ---
 > > Subject: [PATCH 21/26] tty: Convert tty_struct bitfield to bools
 > > 
 > > The stopped, hw_stopped, flow_stopped and packet bits are smp-unsafe
 > > and interrupt-unsafe. For example,
 > > 
 > > CPU 0 | CPU 1
 > >   |
 > > tty->flow_stopped = 1 | tty->hw_stopped = 0
 > > 
 > > One of these updates will be corrupted, as the bitwise operation
 > > on the bitfield is non-atomic.
 > > 
 > > Ensure each flag has a separate memory location, so concurrent
 > > updates do not corrupt orthogonal states.
 > > 
 > > Signed-off-by: Peter Hurley 
 > > ---
 > >  include/linux/tty.h | 5 -
 > >  1 file changed, 4 insertions(+), 1 deletion(-)
 > > 
 > > diff --git a/include/linux/tty.h b/include/linux/tty.h
 > > index 1c3316a..7cf61cb 100644
 > > --- a/include/linux/tty.h
 > > +++ b/include/linux/tty.h
 > > @@ -261,7 +261,10 @@ struct tty_struct {
 > >unsigned long flags;
 > >int count;
 > >struct winsize winsize; /* winsize_mutex */
 > > -  unsigned char stopped:1, hw_stopped:1, flow_stopped:1, packet:1;
 > > +  bool stopped;
 > > +  bool hw_stopped;
 > > +  bool flow_stopped;
 > > +  bool packet;
 > >unsigned char ctrl_status;  /* ctrl_lock */
 > >unsigned int receive_room;  /* Bytes free for queue */
 > >int flow_change;
 > 
 > 
 > --
 > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
 > the body of a message to majord...@vger.kernel.org
 > More majordomo info at  http://vger.kernel.org/majordomo-info.html
 > Please read the FAQ at  http://www.tux.org/lkml/


Re: TTM placement & caching issue/questions

2014-09-04 Thread Thomas Hellstrom
Hi!

Let me try to bring some clarity and suggestions into this.

On 09/04/2014 02:12 AM, Benjamin Herrenschmidt wrote:
> Hi folks !
>
> I've been tracking down some problems with the recent DRI on powerpc and
> stumbled upon something that doesn't look right, and not necessarily
> only for us.
>
> Now it's possible that I haven't fully understood the code here and I
> also don't know to what extent some of that behaviour is necessary for
> some platforms such as Intel GTT bits.
>
> What I've observed with a simple/dumb (no DMA) driver like AST (but this
> probably happens more generally) is that when evicting a BO from VRAM
> into System memory, the TTM tries to preserve the existing caching
> attributes of the VRAM object.
>
> From what I can tell, we end up going from VRAM to System memory
> type, and we eventually call ttm_bo_select_caching() to select the
> caching option for the target.
>
> This will, from what I can tell, try to use the same caching mode as the
> original object:
>
>   if ((cur_placement & caching) != 0)
>   result |= (cur_placement & caching);
>
> And cur_placement comes from bo->mem.placement which as far as I can
> tell is based on the placement array which the drivers set up.

This originates from the fact that when evicting GTT memory, on x86 it's
unnecessary and undesirable to switch caching mode when going to system.

>
> Now they tend to uniformly setup the placement for System memory as
> TTM_PL_MASK_CACHING which enables all caching modes.
>
> So I end up with, for example, my System memory BOs having
> TTM_PL_FLAG_CACHED not set (though they also don't have
> TTM_PL_FLAG_UNCACHED) and TTM_PL_FLAG_WC.
>
> We don't seem to use the man->default_caching (which will have
> TTM_PL_FLAG_CACHED) unless there is no matching bit at all between the
> proposed placement and the existing caching mode.
>
> Now this is a problem for several reason that I can think of:
>
>  - On a number of powerpc platforms, such as all our server 64-bit one
> for example, it's actually illegal to map system memory non-cached. The
> system is fully cache coherent for all possible DMA originators (that we
> care about at least) and mapping memory non-cachable while it's mapped
> cachable in the linear mapping can cause nasty cache paradox which, when
> detected by HW, can checkstop the system.
>
>  - A similar issue exists, afaik, on ARM >= v7, so anything mapped
> non-cachable must be removed from the linear mapping explicitly since
> otherwise it can be speculatively prefetched into the cache.
>
>  - I don't know about x86, but even then, it looks quite sub-optimal to
> map the memory backing of the BOs and access it using a WC rather than a
> cachable mapping attribute.

Last time I tested, (and it seems like Michel is on the same track),
writing with the CPU to write-combined memory was substantially faster
than writing to cached memory, with the additional side-effect that CPU
caches are left unpolluted.

Moreover (although only tested on Intel's embedded chipsets), texturing
from cpu-cache-coherent PCI memory was a real GPU performance hog
compared to texturing from non-snooped memory. Hence, whenever a buffer
could be classified as GPU-read-only (or almost at least), it should be
placed in write-combined memory.

>
> Now, some folks on IRC mentioned that there might be reasons for the
> current behaviour as to not change the caching attributes when going
> in/out of the GTT on Intel, I don't know how that relates and how that
> works, but maybe that should be enforced by having a different placement
> mask specifically on those chipsets.
>
> Dave, should we change the various PCI drivers for generally coherent
> devices such that the System memory type doesn't allow placements
> without CACHED attribute ? Or at least on coherent platforms ? How do
> detect that ? Should we have a TTM helper to establish the default
> memory placement attributes that "normal PCI" drivers call to set that
> up so we can have all the necessary arch ifdefs in one single place, at
> least for "classic PCI/PCIe" stuff (AGP might need additional tweaks) ?
>
> Non-PCI and "special" drivers like Intel can use a different set of
> placement attributes to represent the requirements of those specific
> platforms (mostly thinking of embedded ARM here which under some
> circumstances might actually require non-cached mappings).
> Or am I missing another part of the puzzle ?
>
> As it-is, things are broken for me even for dumb drivers, and I suspect
> to a large extent with radeon and nouveau too, though in some case we
> might get away with it most of the time ... until the machine locks up
> for some unexplainable reason... This might cause problems on existing
> distros such as RHEL7 with our radeon adapters even.
>
> Any suggestion of what's the best approach to fix it ? I'm happy to
> produce the patches but I'm not that familiar with the TTM so I would
> like to make sure I'm the right track first :-)

Re: TTM placement & caching issue/questions

2014-09-04 Thread Thomas Hellstrom
On 09/04/2014 10:06 AM, Benjamin Herrenschmidt wrote:
> On Thu, 2014-09-04 at 09:44 +0200, Thomas Hellstrom wrote:
>
>>> This will, from what I can tell, try to use the same caching mode as the
>>> original object:
>>>
>>> if ((cur_placement & caching) != 0)
>>> result |= (cur_placement & caching);
>>>
>>> And cur_placement comes from bo->mem.placement which as far as I can
>>> tell is based on the placement array which the drivers set up.
>> This originates from the fact that when evicting GTT memory, on x86 it's
>> unnecessary and undesirable to switch caching mode when going to system.
> But that's what I don't quite understand. We have two different mappings
> here. The VRAM and the memory object. We wouldn't be "switching"... we
> are creating a temporary mapping for the memory object in order to do
> the memcpy, but we seem to be doing it by using the caching attributes
> of the VRAM object or am I missing something ? I don't see how that
> makes sense so I suppose I'm missing something here :-)

Well, the intention when TTM was written was that the driver writer
should be smart enough that when he wanted a move from uncached VRAM to
system, he'd request cached system in the placement flags in the first
place.  If TTM somehow overrides such a request, that's a bug in TTM.

If the move, for example, is a result of an eviction, then the driver
evict_flags() function should ideally look at the current placement and
decide about a suitable placement based on that: vram-to-system moves
should generally request cacheable memory if the next access is expected
by the CPU. Probably write-combined otherwise.
If the move is the result of a TTM swapout, TTM will automatically
select cachable system, and for most other moves, I think the driver
writer is in full control.
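
As a sketch of the evict_flags() behavior Thomas describes (written against
the TTM API of that era; the placement array and function names here are
hypothetical, not from any driver):

/* Hypothetical eviction placement: explicitly cached system memory. */
static uint32_t evict_system_cached[] = {
	TTM_PL_FLAG_SYSTEM | TTM_PL_FLAG_CACHED,
};

static struct ttm_placement evict_placement = {
	.num_placement = 1,
	.placement = evict_system_cached,
	.num_busy_placement = 1,
	.busy_placement = evict_system_cached,
};

static void demo_evict_flags(struct ttm_buffer_object *bo,
			     struct ttm_placement *placement)
{
	/* Evicting VRAM with the CPU doing the next access: ask for cached
	 * system memory instead of inheriting the VRAM caching flags. */
	if (bo->mem.mem_type == TTM_PL_VRAM)
		*placement = evict_placement;
}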

>
>> Last time I tested, (and it seems like Michel is on the same track),
>> writing with the CPU to write-combined memory was substantially faster
>> than writing to cached memory, with the additional side-effect that CPU
>> caches are left unpolluted.
> That's very strange indeed. It's certainly an x86 specific artifact,
> even if we were allowed by our hypervisor to map memory non-cachable
> (the HW somewhat can), we tend to have a higher throughput by going
> cachable, but that could be due to the way the PowerBus works (it's
> basically very biased toward cachable transactions).
>
>> I dislike the approach of rewriting placements. In some cases I think it
>> won't even work, because placements are declared 'static const'
>>
>> What I'd suggest is instead to intercept the driver response from
>> init_mem_type() and filter out undesired caching modes from
>> available_caching and default_caching, 
> This was my original intent but Jerome seems to have different ideas
> (see his proposed patches). I'm happy to revive mine as well and post it
> as an alternative after I've tested it a bit more (tomorrow).
>
>> perhaps also looking at whether
>> the memory type is mappable or not. This should have the additional
>> benefit of working everywhere, and if a caching mode is selected that's
>> not available on the platform, you'll simply get an error. (I guess?)
> You mean that if not mappable we don't bother filtering ?
>
> The rule is really for me pretty simple:
>
>- If it's system memory (PL_SYSTEM/PL_TT), it MUST be cachable
>
>- If it's PCIe memory space (VRAM, registers, ...) it MUST be
> non-cachable.

Yes, something along these lines. I guess checking for VRAM or
TTM_MEMTYPE_FLAG_FIXED would perhaps do the trick

/Thomas
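
Such a filter could look roughly like the sketch below (illustrative only; the
helper name is hypothetical and it assumes the ttm_mem_type_manager fields of
that era):

/* After the driver's init_mem_type(): on coherent platforms, force
 * system-backed types to cached; leave fixed (VRAM/register) types to
 * whatever caching the driver asked for. */
static void filter_caching_modes(struct ttm_mem_type_manager *man)
{
	if (man->flags & TTM_MEMTYPE_FLAG_FIXED)
		return;

	man->available_caching &= TTM_PL_FLAG_CACHED;
	man->default_caching = TTM_PL_FLAG_CACHED;
}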

>
> Cheers,
> Ben.
>
>> /Thomas
>>
>>
>>> Cheers,
>>> Ben.
>>>
>>>
>>> ___
>>> dri-devel mailing list
>>> dri-de...@lists.freedesktop.org
> > > http://lists.freedesktop.org/mailman/listinfo/dri-devel
>


Re: bit fields && data tearing

2014-09-04 Thread Jakub Jelinek
On Thu, Sep 04, 2014 at 08:24:12AM -0400, Peter Hurley wrote:
> And I just confirmed with the Alpha cross-compiler that the fields are
> not 'padded out' if volatile either.

They can't be, struct layout is part of the ABI.
Guess you can introduce say atomic_bool and similar typedefs which would be
bool or char on most arches and on alpha int.

Jakub
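
A sketch of the typedef Jakub suggests (hypothetical type, not something that
exists in the kernel; conservatively treating all Alpha as affected):

#ifdef CONFIG_ALPHA
typedef int atomic_bool;	/* pre-EV56 Alpha does rmw on byte stores */
#else
typedef bool atomic_bool;	/* byte stores are atomic elsewhere */
#endif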

Re: bit fields && data tearing

2014-09-04 Thread Peter Hurley
On 09/04/2014 05:09 AM, Jakub Jelinek wrote:
> On Thu, Sep 04, 2014 at 10:57:40AM +0200, Mikael Pettersson wrote:
>> Benjamin Herrenschmidt writes:
>>  > On Wed, 2014-09-03 at 18:51 -0400, Peter Hurley wrote:
>>  > 
>>  > > Apologies for hijacking this thread but I need to extend this discussion
>>  > > somewhat regarding what a compiler might do with adjacent fields in a 
>> structure.
>>  > > 
>>  > > The tty subsystem defines a large aggregate structure, struct 
>> tty_struct.
>>  > > Importantly, several different locks apply to different fields within 
>> that
>>  > > structure; ie., a specific spinlock will be claimed before updating or 
>> accessing
>>  > > certain fields while a different spinlock will be claimed before 
>> updating or
>>  > > accessing certain _adjacent_ fields.
>>  > > 
>>  > > What is necessary and sufficient to prevent accidental false-sharing?
>>  > > The patch below was flagged as insufficient on ia64, and possibly ARM.
>>  > 
>>  > We expect native aligned scalar types to be accessed atomically (the
>>  > read/modify/write of a larger quantity that gcc does on some bitfield
>>  > cases has been flagged as a gcc bug, but shouldn't happen on normal
>>  > scalar types).
>>  > 
>>  > I am not 100% certain of "bool" here, I assume it's treated as a normal
>>  > scalar and thus atomic but if unsure, you can always use int.
>>
>> Please use an aligned int or long.  Some machines cannot do atomic
>> accesses on sub-int/long quantities, so 'bool' may cause unexpected
>> rmw cycles on adjacent fields.
> 
> Yeah, at least pre-EV56 Alpha performs rmw cycles on char/short accesses
> and thus those are not atomic.

Ok, thanks.

And I just confirmed with the Alpha cross-compiler that the fields are
not 'padded out' if volatile either.

Do any arches consider this an 'optimization'?  I ask because this kind of
accidental adjacency sharing may be common. Even RCU has a char field,
rcu_read_unlock_special, in the middle of the task_struct; luckily the
adjacent field is a list_head.

Besides updating the documentation, it may make sense to do something
arch-specific. Just bumping out storage on arches that don't need it
seems wasteful, as does generating bus locks on arches that don't need it.
Unfortunately, the code churn looks unavoidable.

Regards,
Peter Hurley

[PATCH v2] iommu/fsl: Fix warning resulting from adding PCI device twice

2014-09-04 Thread Varun Sethi
iommu_group_get_for_dev determines the iommu group for the PCI device and adds
the device to the group.

In the PAMU driver we were again adding the device to the same group without
checking if the device already had an iommu group. This resulted in the
following warning.

sysfs: cannot create duplicate filename 
'/devices/ffe20.pcie/pci:00/:00:00.0/iommu_group'
[ cut here ]
WARNING: at fs/sysfs/dir.c:31
Modules linked in:
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.17.0-rc3-2-g7505cea-dirty #126
task: c001fe0a ti: c001fe044000 task.ti: c001fe044000
NIP: c018879c LR: c0188798 CTR: c001ea50
REGS: c001fe047040 TRAP: 0700   Not tainted  
(3.17.0-rc3-2-g7505cea-dirty)
MSR: 80029000   CR: 24ad8e22  XER: 2000
SOFTE: 1 
GPR00: c0188798 c001fe0472c0 c09a52e0 0065 
GPR04: 0001  3a30303a 2700 
GPR08: 2f696f6d c08d3830 c09b3938 c09bb3d0 
GPR12: 28ad8e24 cfff4000 c000205c  
GPR16:     
GPR20:    c08a4c70 
GPR24: c07e9010 c001fe0140a8 ffef 0001 
GPR28: c001fe22ebb8 c07e9010 c090bf10 c001fe22 
NIP [c018879c] .sysfs_warn_dup+0x74/0xa4
LR [c0188798] .sysfs_warn_dup+0x70/0xa4
Call Trace:
[c001fe0472c0] [c0188798] .sysfs_warn_dup+0x70/0xa4 (unreliable)
[c001fe047350] [c0188d34] 
.sysfs_do_create_link_sd.clone.2+0x168/0x174
[c001fe047400] [c04b3cf8] .iommu_group_add_device+0x78/0x244
[c001fe0474b0] [c04b6964] .fsl_pamu_add_device+0x88/0x1a8
[c001fe047570] [c04b3960] .iommu_bus_notifier+0xdc/0x15c
[c001fe047600] [c0059848] .notifier_call_chain+0x8c/0xe8
[c001fe0476a0] [c0059d04] .__blocking_notifier_call_chain+0x58/0x84
[c001fe047750] [c036619c] .device_add+0x464/0x5c8
[c001fe047820] [c0300ebc] .pci_device_add+0x14c/0x17c
[c001fe0478c0] [c0300fbc] .pci_scan_single_device+0xd0/0xf4
[c001fe047970] [c030104c] .pci_scan_slot+0x6c/0x18c
[c001fe047a10] [c030226c] .pci_scan_child_bus+0x40/0x114
[c001fe047ac0] [c0021974] .pcibios_scan_phb+0x240/0x2c8
[c001fe047b70] [c085a970] .pcibios_init+0x64/0xc8
[c001fe047c00] [c0001884] .do_one_initcall+0xbc/0x224
[c001fe047d00] [c0852d50] .kernel_init_freeable+0x14c/0x21c
[c001fe047db0] [c0002078] .kernel_init+0x1c/0xfa4
[c001fe047e30] [c884] .ret_from_kernel_thread+0x58/0xd4
Instruction dump:
7c7f1b79 4182001c 7fe4fb78 7f83e378 38a01000 4bffc905 6000 7c641b78 
e87e8008 7fa5eb78 48482ff5 6000 <0fe0> 7fe3fb78 4bf7bd39 6000 



Signed-off-by: Varun Sethi 
---
v2 changes
- directly check for the device iommu_group

 drivers/iommu/fsl_pamu_domain.c |   10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c
index 61d1daf..56feed7 100644
--- a/drivers/iommu/fsl_pamu_domain.c
+++ b/drivers/iommu/fsl_pamu_domain.c
@@ -984,7 +984,7 @@ static int fsl_pamu_add_device(struct device *dev)
struct iommu_group *group = ERR_PTR(-ENODEV);
struct pci_dev *pdev;
const u32 *prop;
-   int ret, len;
+   int ret = 0, len;
 
/*
 * For platform devices we allocate a separate group for
@@ -1007,7 +1007,13 @@ static int fsl_pamu_add_device(struct device *dev)
if (IS_ERR(group))
return PTR_ERR(group);
 
-   ret = iommu_group_add_device(group, dev);
+   /*
+* Check if device has already been added to an iommu group.
+* Group could have already been created for a PCI device in
+* the iommu_group_get_for_dev path.
+*/
+   if (!dev->iommu_group)
+   ret = iommu_group_add_device(group, dev);
 
iommu_group_put(group);
return ret;
-- 
1.7.9.5


RE: [PATCH] iommu/fsl: Fix warning resulting from adding PCI device twice

2014-09-04 Thread Varun Sethi


> -Original Message-
> From: Joerg Roedel [mailto:j...@8bytes.org]
> Sent: Thursday, September 04, 2014 3:44 PM
> To: Sethi Varun-B16395
> Cc: io...@lists.linux-foundation.org; alex.william...@redhat.com; Medve
> Emilian-EMMEDVE1; linuxppc-...@ozlabs.org; linux-ker...@vger.kernel.org
> Subject: Re: [PATCH] iommu/fsl: Fix warning resulting from adding PCI device
> twice
> 
> On Thu, Sep 04, 2014 at 11:33:42AM +0530, Varun Sethi wrote:
> > +   if (!iommu_group_get(dev))
> > +   ret = iommu_group_add_device(group, dev);
> >
> > iommu_group_put(group);
> > return ret;
> 
> Doesn't this additional call to iommu_group_get take a reference to the
> iommu_group that needs to be released?
Yes, you are right; this would take a reference to the group that should be
released. I need to add iommu_group_put() to decrement the reference count.
Will post a new patch.

Thanks,
Varun
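
For illustration, the refcount-balanced variant hinted at above could look
like this sketch (hypothetical helper; the posted v2 instead checks
dev->iommu_group directly):

/* Add the device only when it is not yet in a group, dropping the
 * reference that iommu_group_get() takes on an existing group. */
static int sketch_add_device(struct iommu_group *group, struct device *dev)
{
	struct iommu_group *prev = iommu_group_get(dev);
	int ret = 0;

	if (prev)
		iommu_group_put(prev);	/* already grouped: release extra ref */
	else
		ret = iommu_group_add_device(group, dev);

	iommu_group_put(group);		/* reference from the group lookup */
	return ret;
}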



Re: [PATCH] iommu/fsl: Fix warning resulting from adding PCI device twice

2014-09-04 Thread Joerg Roedel
On Thu, Sep 04, 2014 at 11:33:42AM +0530, Varun Sethi wrote:
> + if (!iommu_group_get(dev))
> + ret = iommu_group_add_device(group, dev);
>  
>   iommu_group_put(group);
>   return ret;

Doesn't this additional call to iommu_group_get take a reference to the
iommu_group that needs to be released?


Joerg


Re: bit fields && data tearing

2014-09-04 Thread Benjamin Herrenschmidt
On Thu, 2014-09-04 at 08:43 +, David Laight wrote:
> From:  Benjamin Herrenschmidt
> > On Wed, 2014-09-03 at 18:51 -0400, Peter Hurley wrote:
> > 
> > > Apologies for hijacking this thread but I need to extend this discussion
> > > somewhat regarding what a compiler might do with adjacent fields in a 
> > > structure.
> > >
> > > The tty subsystem defines a large aggregate structure, struct tty_struct.
> > > Importantly, several different locks apply to different fields within that
> > > structure; ie., a specific spinlock will be claimed before updating or 
> > > accessing
> > > certain fields while a different spinlock will be claimed before updating 
> > > or
> > > accessing certain _adjacent_ fields.
> > >
> > > What is necessary and sufficient to prevent accidental false-sharing?
> > > The patch below was flagged as insufficient on ia64, and possibly ARM.
> > 
> > We expect native aligned scalar types to be accessed atomically (the
> > read/modify/write of a larger quantity that gcc does on some bitfield
> > cases has been flagged as a gcc bug, but shouldn't happen on normal
> > scalar types).
> 
> That isn't true on all architectures for items smaller than a machine word.
> At least one has to do rmw for byte accesses.

Yeah correct, Alpha and bytes, right? Is there any other? That's why I
suggested int.

>   David
> 
> > I am not 100% certain of "bool" here, I assume it's treated as a normal
> > scalar and thus atomic but if unsure, you can always use int.
> > 
> > Another option is to use the atomic bitops and make these bits in a
> > bitmask but that is probably unnecessary if you have locks already.
> > 
> > Cheers,
> > Ben.



Re: TTM placement & caching issue/questions

2014-09-04 Thread Benjamin Herrenschmidt
On Thu, 2014-09-04 at 11:34 +0200, Daniel Vetter wrote:
> On Thu, Sep 04, 2014 at 09:44:04AM +0200, Thomas Hellstrom wrote:
> > Last time I tested, (and it seems like Michel is on the same track),
> > writing with the CPU to write-combined memory was substantially faster
> > than writing to cached memory, with the additional side-effect that CPU
> > caches are left unpolluted.
> > 
> > Moreover (although only tested on Intel's embedded chipsets), texturing
> > from cpu-cache-coherent PCI memory was a real GPU performance hog
> > compared to texturing from non-snooped memory. Hence, whenever a buffer
> > could be classified as GPU-read-only (or almost at least), it should be
> > placed in write-combined memory.
> 
> Just a quick comment since this explicitly refers to Intel chips: On
> desktop/laptop chips with the big shared L3/L4 caches it's the other way
> round. Cached uploads are substantially faster than wc and not using
> coherent access is a severe perf hit for texturing. I guess the hw guys
> worked really hard to hide the snooping costs so that the gpu can benefit
> from the massive bandwidth these caches can provide.

This is similar to modern POWER chips as well. We have pretty big L3's
(though not technically shared they are in a separate quadrant and we
have a shared L4 in the memory buffer) and our fabric is generally
optimized for cachable/coherent access performance. In fact, we only
have so many credits for NC accesses on the bus...

What that tells me is that when setting up the desired cachability
attributes for the mapping of a memory object, we need to consider these
things here:

  - The hard requirement of the HW (non-coherent GPUs require NC, AGP
does in some cases, etc...) which I think is basically already handled
using the placement attributes set by the GPU driver for the memory type

  - The optimal attributes (and platform hard requirements) for fast
memory accesses to an object by the processor.  From what I read here,
this can be NC+WC on older Intel, cachable on newer, etc...)

  - The optimal attributes for fast GPU DMA accesses to the object in
system memory. Here too, this is fairly platform/chipset dependent.

Do we have flags in the DRM that tell us whether an object in memory is
more likely to be used by the GPU via DMA vs by the CPU via MMIO ? On
powerpc (except in the old AGP case), I wouldn't care and would require
cachable in both cases, but I can see the low-latency crowd wanting the
former to be non-cachable, while dumb GPUs like AST that don't do DMA
would benefit greatly from the latter...

Cheers,
Ben.


Re: bit fields && data tearing

2014-09-04 Thread Jakub Jelinek
On Thu, Sep 04, 2014 at 10:57:40AM +0200, Mikael Pettersson wrote:
> Benjamin Herrenschmidt writes:
>  > On Wed, 2014-09-03 at 18:51 -0400, Peter Hurley wrote:
>  > 
>  > > Apologies for hijacking this thread but I need to extend this discussion
>  > > somewhat regarding what a compiler might do with adjacent fields in a 
> structure.
>  > > 
>  > > The tty subsystem defines a large aggregate structure, struct tty_struct.
>  > > Importantly, several different locks apply to different fields within 
> that
>  > > structure; ie., a specific spinlock will be claimed before updating or 
> accessing
>  > > certain fields while a different spinlock will be claimed before 
> updating or
>  > > accessing certain _adjacent_ fields.
>  > > 
>  > > What is necessary and sufficient to prevent accidental false-sharing?
>  > > The patch below was flagged as insufficient on ia64, and possibly ARM.
>  > 
>  > We expect native aligned scalar types to be accessed atomically (the
>  > read/modify/write of a larger quantity that gcc does on some bitfield
>  > cases has been flagged as a gcc bug, but shouldn't happen on normal
>  > scalar types).
>  > 
>  > I am not 100% certain of "bool" here, I assume it's treated as a normal
>  > scalar and thus atomic but if unsure, you can always use int.
> 
> Please use an aligned int or long.  Some machines cannot do atomic
> accesses on sub-int/long quantities, so 'bool' may cause unexpected
> rmw cycles on adjacent fields.

Yeah, at least pre-EV56 Alpha performs rmw cycles on char/short accesses
and thus those are not atomic.

Jakub

RE: bit fields && data tearing

2014-09-04 Thread David Laight
From:  Benjamin Herrenschmidt
> On Wed, 2014-09-03 at 18:51 -0400, Peter Hurley wrote:
> 
> > Apologies for hijacking this thread but I need to extend this discussion
> > somewhat regarding what a compiler might do with adjacent fields in a 
> > structure.
> >
> > The tty subsystem defines a large aggregate structure, struct tty_struct.
> > Importantly, several different locks apply to different fields within that
> > structure; ie., a specific spinlock will be claimed before updating or 
> > accessing
> > certain fields while a different spinlock will be claimed before updating or
> > accessing certain _adjacent_ fields.
> >
> > What is necessary and sufficient to prevent accidental false-sharing?
> > The patch below was flagged as insufficient on ia64, and possibly ARM.
> 
> We expect native aligned scalar types to be accessed atomically (the
> read/modify/write of a larger quantity that gcc does on some bitfield
> cases has been flagged as a gcc bug, but shouldn't happen on normal
> scalar types).

That isn't true on all architectures for items smaller than a machine word.
At least one has to do rmw for byte accesses.

David

> I am not 100% certain of "bool" here, I assume it's treated as a normal
> scalar and thus atomic but if unsure, you can always use int.
> 
> Another option is to use the atomic bitops and make these bits in a
> bitmask but that is probably unnecessary if you have locks already.
> 
> Cheers,
> Ben.

Re: TTM placement & caching issue/questions

2014-09-04 Thread Benjamin Herrenschmidt
On Thu, 2014-09-04 at 16:59 +0900, Michel Dänzer wrote:
> 
> Define 'not reliably'. I have uptimes of weeks, and I'm pretty sure I'm 
> not alone, at least with AGP 1x it seems to work quite well for most 
> people. So I don't see the justification for intentionally breaking it 
> completely for all of us.

Oh I wasn't arguing for breaking it, just jesting. We need to keep it
working. It's amazing how well broken stuff actually works though :-)

I mean, it's architecturally broken and if we get a collision between
the cache and the NCU, the chip will crash. We just get lucky I suppose.

Anyway, I'll try a different approach tomorrow see how it goes.

Cheers,
Ben.



Re: TTM placement & caching issue/questions

2014-09-04 Thread Benjamin Herrenschmidt
On Thu, 2014-09-04 at 09:44 +0200, Thomas Hellstrom wrote:

> > This will, from what I can tell, try to use the same caching mode as the
> > original object:
> >
> > if ((cur_placement & caching) != 0)
> > result |= (cur_placement & caching);
> >
> > And cur_placement comes from bo->mem.placement which as far as I can
> > tell is based on the placement array which the drivers set up.
> 
> This originates from the fact that when evicting GTT memory, on x86 it's
> unnecessary and undesirable to switch caching mode when going to system.

But that's what I don't quite understand. We have two different mappings
here. The VRAM and the memory object. We wouldn't be "switching"... we
are creating a temporary mapping for the memory object in order to do
the memcpy, but we seem to be doing it by using the caching attributes
of the VRAM object or am I missing something ? I don't see how that
makes sense so I suppose I'm missing something here :-)

> Last time I tested, (and it seems like Michel is on the same track),
> writing with the CPU to write-combined memory was substantially faster
> than writing to cached memory, with the additional side-effect that CPU
> caches are left unpolluted.

That's very strange indeed. It's certainly an x86 specific artifact,
even if we were allowed by our hypervisor to map memory non-cachable
(the HW somewhat can), we tend to have a higher throughput by going
cachable, but that could be due to the way the PowerBus works (it's
basically very biased toward cachable transactions).

> I dislike the approach of rewriting placements. In some cases I think it
> won't even work, because placements are declared 'static const'
> 
> What I'd suggest is instead to intercept the driver response from
> init_mem_type() and filter out undesired caching modes from
> available_caching and default_caching, 

This was my original intent but Jerome seems to have different ideas
(see his proposed patches). I'm happy to revive mine as well and post it
as an alternative after I've tested it a bit more (tomorrow).

> perhaps also looking at whether
> the memory type is mappable or not. This should have the additional
> benefit of working everywhere, and if a caching mode is selected that's
> not available on the platform, you'll simply get an error. (I guess?)

You mean that if not mappable we don't bother filtering ?

The rule is really for me pretty simple:

   - If it's system memory (PL_SYSTEM/PL_TT), it MUST be cachable

   - If it's PCIe memory space (VRAM, registers, ...) it MUST be
non-cachable.

Cheers,
Ben.
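
That rule could be sketched as a trivial caching filter (illustrative only;
the helper is hypothetical):

/* Coherent-platform rule of thumb: system-backed memory must be cached,
 * PCIe space (VRAM, registers, ...) must not be. */
static uint32_t ppc_fixup_caching(uint32_t mem_type, uint32_t placement)
{
	placement &= ~TTM_PL_MASK_CACHING;
	if (mem_type == TTM_PL_SYSTEM || mem_type == TTM_PL_TT)
		placement |= TTM_PL_FLAG_CACHED;
	else
		placement |= TTM_PL_FLAG_UNCACHED;
	return placement;
}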

> /Thomas
> 
> 
> >
> > Cheers,
> > Ben.
> >
> >
> > ___
> > dri-devel mailing list
> > dri-de...@lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/dri-devel



Re: TTM placement & caching issue/questions

2014-09-04 Thread Michel Dänzer

On 04.09.2014 16:59, Michel Dänzer wrote:
> On 04.09.2014 16:54, Benjamin Herrenschmidt wrote:
> > On Thu, 2014-09-04 at 16:19 +0900, Michel Dänzer wrote:
> > > > +#else /* CONFIG_X86 */
> > > > +int ttm_tt_set_placement_caching(struct ttm_tt *ttm, uint32_t *placement)
> > > > +{
> > > > +	if (*placement & (TTM_PL_TT | TTM_PL_FLAG_SYSTEM)) {
> > > > +		ttm->caching_state = tt_cached;
> > > > +		*placement &= ~TTM_PL_MASK_CACHING;
> > > > +		*placement |= TTM_PL_FLAG_CACHED;
> > >
> > > NAK, this will break AGP on PowerMacs.
> >
> > ... which doesn't work reliably anyway with DRI2 :-)
>
> Define 'not reliably'. I have uptimes of weeks, and I'm pretty sure I'm
> not alone, at least with AGP 1x it seems to work quite well for most
> people. So I don't see the justification for intentionally breaking it
> completely for all of us.

Even more so because PCI GART is unusably slow in general.


--
Earthling Michel Dänzer|  http://www.amd.com
Libre software enthusiast  |Mesa and X developer

Re: TTM placement & caching issue/questions

2014-09-04 Thread Michel Dänzer

On 04.09.2014 16:54, Benjamin Herrenschmidt wrote:
> On Thu, 2014-09-04 at 16:19 +0900, Michel Dänzer wrote:
> > > +#else /* CONFIG_X86 */
> > > +int ttm_tt_set_placement_caching(struct ttm_tt *ttm, uint32_t *placement)
> > > +{
> > > +	if (*placement & (TTM_PL_TT | TTM_PL_FLAG_SYSTEM)) {
> > > +		ttm->caching_state = tt_cached;
> > > +		*placement &= ~TTM_PL_MASK_CACHING;
> > > +		*placement |= TTM_PL_FLAG_CACHED;
> >
> > NAK, this will break AGP on PowerMacs.
>
> ... which doesn't work reliably anyway with DRI2 :-)

Define 'not reliably'. I have uptimes of weeks, and I'm pretty sure I'm
not alone, at least with AGP 1x it seems to work quite well for most
people. So I don't see the justification for intentionally breaking it
completely for all of us.



--
Earthling Michel Dänzer|  http://www.amd.com
Libre software enthusiast  |Mesa and X developer

Re: TTM placement & caching issue/questions

2014-09-04 Thread Benjamin Herrenschmidt
On Thu, 2014-09-04 at 16:19 +0900, Michel Dänzer wrote:
> > +#else /* CONFIG_X86 */
> > +int ttm_tt_set_placement_caching(struct ttm_tt *ttm, uint32_t *placement)
> > +{
> > + if (*placement & (TTM_PL_TT | TTM_PL_FLAG_SYSTEM)) {
> > + ttm->caching_state = tt_cached;
> > + *placement &= ~TTM_PL_MASK_CACHING;
> > + *placement |= TTM_PL_FLAG_CACHED;
> 
> NAK, this will break AGP on PowerMacs.

 ... which doesn't work reliably anyway with DRI2 :-)

The problem is ... with DRI1 I think we had tricks to take out the
AGP from the linear mapping but that went away, didn't it?

In any case, we are playing with fire on these by allowing the
cache paradox. It just happens that those old CPUs aren't *that*
aggressive at speculative prefetch and we probably rarely hit the
lockups that they would cause...

Michel, what do you recommend we do then ? The patch I sent to
double check in ttm_io_prot() has a specific hack to avoid warning
on PowerMac for the above reason, but we need to fix Jerome's patch if we
want to keep that broken-by-design Mac AGP functionality going :-)

Maybe we could add a similar ifdef in the above ?

Cheers,
Ben.



Re: TTM placement & caching issue/questions

2014-09-04 Thread Michel Dänzer

On 04.09.2014 10:55, Jerome Glisse wrote:
> While i agree about the issue of incoherent double map of same page, i
> think we have more issues. For instance lately AMD have been pushing a
> lot of patches to move things to use uncached memory for radeon and as
> usual those patches come with no comment on the motivations of those
> changes.

That would have been a fair review comment...

> What i understand is that uncached mapping for some frequently used buffers
> gives a significant performance boost (i am assuming this has to do with
> all the snoop pci transaction overhead).

Exactly, although it's a win even if the data is written by the CPU only
once and read by the GPU only once.

> This also means that we need to fix ttm_tt_set_placement_caching so that
> when it returns an error it switches to cached mapping. Which will always
> work.

GTT with AGP being one exception.


--
Earthling Michel Dänzer|  http://www.amd.com
Libre software enthusiast  |Mesa and X developer

Re: TTM placement & caching issue/questions

2014-09-04 Thread Michel Dänzer

On 04.09.2014 11:36, Jerome Glisse wrote:
> On Wed, Sep 03, 2014 at 10:31:18PM -0400, Jerome Glisse wrote:
> > On Thu, Sep 04, 2014 at 12:25:23PM +1000, Benjamin Herrenschmidt wrote:
> > > On Wed, 2014-09-03 at 22:07 -0400, Jerome Glisse wrote:
> > > >
> > > > So in the meantime the attached patch should work, it just silently
> > > > ignores the caching attribute request on non-x86 instead of pretending
> > > > that things are set up as expected and then later the radeon or nouveau
> > > > hw unsetting the snoop bit.
> > > >
> > > > It's not tested but i think it should work.
> > >
> > > I'm still getting placements with !CACHED going from bo_memcpy in
> > > ttm_io_prot() though ... I'm looking at filtering the placement
> > > attributes instead.
> > >
> > > Ben.
> >
> > Ok so this one should do the trick.
>
> Ok final version ... famous last words.
>
> [...]
>
> +#else /* CONFIG_X86 */
> +int ttm_tt_set_placement_caching(struct ttm_tt *ttm, uint32_t *placement)
> +{
> +	if (*placement & (TTM_PL_TT | TTM_PL_FLAG_SYSTEM)) {
> +		ttm->caching_state = tt_cached;
> +		*placement &= ~TTM_PL_MASK_CACHING;
> +		*placement |= TTM_PL_FLAG_CACHED;

NAK, this will break AGP on PowerMacs.


--
Earthling Michel Dänzer|  http://www.amd.com
Libre software enthusiast  |Mesa and X developer

Re: TTM placement & caching issue/questions

2014-09-04 Thread Gabriel Paubert
On Wed, Sep 03, 2014 at 10:36:57PM -0400, Jerome Glisse wrote:
> On Wed, Sep 03, 2014 at 10:31:18PM -0400, Jerome Glisse wrote:
> > On Thu, Sep 04, 2014 at 12:25:23PM +1000, Benjamin Herrenschmidt wrote:
> > > On Wed, 2014-09-03 at 22:07 -0400, Jerome Glisse wrote:
> > > 
> > > > So in the meantime the attached patch should work, it just silently
> > > > ignores the caching attribute request on non-x86 instead of pretending
> > > > that things are set up as expected and then later the radeon or nouveau
> > > > hw unsetting the snoop bit.
> > > > 
> > > > It's not tested but i think it should work.
> > > 
> > > I'm still getting placements with !CACHED going from bo_memcpy in
> > > ttm_io_prot() though ... I'm looking at filtering the placement
> > > attributes instead.
> > > 
> > > Ben.
> > 
> > Ok so this one should do the trick.
> 
> Ok final version ... famous last words.
[snipped older version]
> From 236038e18dc303bb9aa877922e01963d3fb0b7af Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= 
> Date: Wed, 3 Sep 2014 22:04:34 -0400
> Subject: [PATCH] drm/ttm: force cached mapping on non x86 platform.
> MIME-Version: 1.0
> Content-Type: text/plain; charset=UTF-8
> Content-Transfer-Encoding: 8bit
> 
> People interested in providing uncached or write combined mapping
> on there architecture need to do the ground work inside there arch

s/there/their/g

> specific code to allow to break the linear kernel mapping so that
> page mapping attributes can be updated, in the meantime force cached
> mapping for non x86 architecture.
> 
> Signed-off-by: Jérôme Glisse 
> ---
>  drivers/gpu/drm/radeon/radeon_ttm.c |  2 +-
>  drivers/gpu/drm/ttm/ttm_bo.c|  2 +-
>  drivers/gpu/drm/ttm/ttm_tt.c| 32 +---
>  include/drm/ttm/ttm_bo_driver.h |  2 +-
>  4 files changed, 24 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c
> index 72afe82..4dd5060 100644
> --- a/drivers/gpu/drm/radeon/radeon_ttm.c
> +++ b/drivers/gpu/drm/radeon/radeon_ttm.c
> @@ -304,7 +304,7 @@ static int radeon_move_vram_ram(struct ttm_buffer_object *bo,
>   return r;
>   }
>  
> - r = ttm_tt_set_placement_caching(bo->ttm, tmp_mem.placement);
> + r = ttm_tt_set_placement_caching(bo->ttm, &tmp_mem.placement);
>   if (unlikely(r)) {
>   goto out_cleanup;
>   }
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index 3da89d5..4dc21c3 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -305,7 +305,7 @@ static int ttm_bo_handle_move_mem(struct ttm_buffer_object *bo,
>   goto out_err;
>   }
>  
> - ret = ttm_tt_set_placement_caching(bo->ttm, mem->placement);
> + ret = ttm_tt_set_placement_caching(bo->ttm, &mem->placement);
>   if (ret)
>   goto out_err;
>  
> diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
> index bf080ab..a0df803 100644
> --- a/drivers/gpu/drm/ttm/ttm_tt.c
> +++ b/drivers/gpu/drm/ttm/ttm_tt.c
> @@ -89,14 +89,6 @@ static inline int ttm_tt_set_page_caching(struct page *p,
>  
>   return ret;
>  }
> -#else /* CONFIG_X86 */
> -static inline int ttm_tt_set_page_caching(struct page *p,
> -   enum ttm_caching_state c_old,
> -   enum ttm_caching_state c_new)
> -{
> - return 0;
> -}
> -#endif /* CONFIG_X86 */
>  
>  /*
>   * Change caching policy for the linear kernel map
> @@ -149,19 +141,37 @@ out_err:
>   return ret;
>  }
>  
> -int ttm_tt_set_placement_caching(struct ttm_tt *ttm, uint32_t placement)
> +int ttm_tt_set_placement_caching(struct ttm_tt *ttm, uint32_t *placement)
>  {
>   enum ttm_caching_state state;
>  
> - if (placement & TTM_PL_FLAG_WC)
> + if (*placement & TTM_PL_FLAG_WC)
>   state = tt_wc;
> - else if (placement & TTM_PL_FLAG_UNCACHED)
> + else if (*placement & TTM_PL_FLAG_UNCACHED)
>   state = tt_uncached;
>   else
>   state = tt_cached;
>  
>   return ttm_tt_set_caching(ttm, state);
>  }
> +#else /* CONFIG_X86 */
> +int ttm_tt_set_placement_caching(struct ttm_tt *ttm, uint32_t *placement)
> +{
> + if (*placement & (TTM_PL_TT | TTM_PL_FLAG_SYSTEM)) {
> + ttm->caching_state = tt_cached;
> + *placement &= ~TTM_PL_MASK_CACHING;
> + *placement |= TTM_PL_FLAG_CACHED;
> + } else {
> + if (*placement & TTM_PL_FLAG_WC)
> + ttm->caching_state = tt_wc;
> + else if (*placement & TTM_PL_FLAG_UNCACHED)
> + ttm->caching_state = tt_uncached;
> + else
> + ttm->caching_state = tt_cached;
> + }
> + return 0;
> +}
> +#endif /* CONFIG_X86 */
>  EXPORT_SYMBOL(ttm_tt_set_placement_caching);