from:"Sonny Rao"

[PATCH v2] vhost: add vsock compat ioctl

2018-03-14 Thread Sonny Rao

This will allow usage of vsock from 32-bit binaries on a 64-bit
kernel.

Signed-off-by: Sonny Rao 
---
 drivers/vhost/vsock.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 0d14e2ff19f16..ee0c385d9fe54 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -699,12 +699,23 @@ static long vhost_vsock_dev_ioctl(struct file *f, 
unsigned int ioctl,
}
 }
 
+#ifdef CONFIG_COMPAT
+static long vhost_vsock_dev_compat_ioctl(struct file *f, unsigned int ioctl,
+unsigned long arg)
+{
+   return vhost_vsock_dev_ioctl(f, ioctl, (unsigned long)compat_ptr(arg));
+}
+#endif
+
 static const struct file_operations vhost_vsock_fops = {
.owner  = THIS_MODULE,
.open   = vhost_vsock_dev_open,
.release= vhost_vsock_dev_release,
.llseek = noop_llseek,
.unlocked_ioctl = vhost_vsock_dev_ioctl,
+#ifdef CONFIG_COMPAT
+   .compat_ioctl   = vhost_vsock_dev_compat_ioctl,
+#endif
 };
 
 static struct miscdevice vhost_vsock_misc = {
-- 
2.13.5

Re: [PATCH] vhost: add vsock compat ioctl

2018-03-14 Thread Sonny Rao

On Wed, Mar 14, 2018 at 12:05 PM, Michael S. Tsirkin  wrote:
> On Wed, Mar 14, 2018 at 10:26:05AM -0700, Sonny Rao wrote:
>> This will allow usage of vsock from 32-bit binaries on a 64-bit
>> kernel.
>>
>> Signed-off-by: Sonny Rao 
>
> I think you need to convert the pointer argument though.
> Something along the lines of:
>
> #ifdef CONFIG_COMPAT
> static long vhost_vsock_dev_compat_ioctl(struct file *f, unsigned int ioctl,
>  unsigned long arg)
> {
> return vhost_vsock_dev_ioctl(f, ioctl, (unsigned 
> long)compat_ptr(arg));
> }
> #endif

Ok, thanks for pointing that out -- it has worked for me so far, but
I'll re-spin as you suggested.

>
>
>
>> ---
>>  drivers/vhost/vsock.c | 1 +
>>  1 file changed, 1 insertion(+)
>>
>> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>> index 0d14e2ff19f16..d0e65e92110e5 100644
>> --- a/drivers/vhost/vsock.c
>> +++ b/drivers/vhost/vsock.c
>> @@ -705,6 +705,7 @@ static const struct file_operations vhost_vsock_fops = {
>>   .release= vhost_vsock_dev_release,
>>   .llseek = noop_llseek,
>>   .unlocked_ioctl = vhost_vsock_dev_ioctl,
>> + .compat_ioctl   = vhost_vsock_dev_ioctl,
>>  };
>>
>>  static struct miscdevice vhost_vsock_misc = {
>> --
>> 2.13.5

[PATCH] vhost: add vsock compat ioctl

2018-03-14 Thread Sonny Rao

This will allow usage of vsock from 32-bit binaries on a 64-bit
kernel.

Signed-off-by: Sonny Rao 
---
 drivers/vhost/vsock.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 0d14e2ff19f16..d0e65e92110e5 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -705,6 +705,7 @@ static const struct file_operations vhost_vsock_fops = {
.release= vhost_vsock_dev_release,
.llseek = noop_llseek,
.unlocked_ioctl = vhost_vsock_dev_ioctl,
+   .compat_ioctl   = vhost_vsock_dev_ioctl,
 };
 
 static struct miscdevice vhost_vsock_misc = {
-- 
2.13.5

[PATCH] vhost: fix vhost ioctl signature to build with clang

2018-03-14 Thread Sonny Rao

Clang is particularly anal about signed vs unsigned comparisons and
doesn't like the fact that some ioctl numbers set the MSB, so we get
this error when trying to build vhost on aarch64:

drivers/vhost/vhost.c:1400:7: error: overflow converting case value to
 switch condition type (3221794578 to 18446744072636378898)
 [-Werror, -Wswitch]
case VHOST_GET_VRING_BASE:

3221794578 is 0xC008AF12 in hex
18446744072636378898 is 0xFFFFFFFFC008AF12 in hex

Fix this by using unsigned ints in the function signature for
vhost_vring_ioctl().

Signed-off-by: Sonny Rao 
---
 drivers/vhost/vhost.c | 2 +-
 drivers/vhost/vhost.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 1b3e8d2d5c8b4..5316319d84081 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1337,7 +1337,7 @@ static long vhost_set_memory(struct vhost_dev *d, struct 
vhost_memory __user *m)
return -EFAULT;
 }
 
-long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp)
+long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user 
*argp)
 {
struct file *eventfp, *filep = NULL;
bool pollstart = false, pollstop = false;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index ac4b6056f19ae..d8ee85ae8fdcc 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -45,7 +45,7 @@ void vhost_poll_stop(struct vhost_poll *poll);
 void vhost_poll_flush(struct vhost_poll *poll);
 void vhost_poll_queue(struct vhost_poll *poll);
 void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work);
-long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp);
+long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user 
*argp);
 
 struct vhost_log {
u64 addr;
@@ -177,7 +177,7 @@ void vhost_dev_reset_owner(struct vhost_dev *, struct 
vhost_umem *);
 void vhost_dev_cleanup(struct vhost_dev *);
 void vhost_dev_stop(struct vhost_dev *);
 long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user 
*argp);
-long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp);
+long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user 
*argp);
 int vhost_vq_access_ok(struct vhost_virtqueue *vq);
 int vhost_log_access_ok(struct vhost_dev *);
 
-- 
2.13.5

Re: [PATCH RFC v2] Add /proc/pid/smaps_rollup

2017-08-10 Thread Sonny Rao

On Thu, Aug 10, 2017 at 3:58 AM, Michal Hocko  wrote:
> On Thu 10-08-17 03:23:23, Daniel Colascione wrote:
>> Thanks for taking a look at the patch!
>>
>> On Thu, Aug 10 2017, Michal Hocko wrote:
>> > [CC linux-api - the patch was posted here
>> > http://lkml.kernel.org/r/20170810001557.147285-1-dan...@google.com]
>> >
>> > On Thu 10-08-17 13:38:31, Minchan Kim wrote:
>> >> On Wed, Aug 09, 2017 at 05:15:57PM -0700, Daniel Colascione wrote:
>> >> > /proc/pid/smaps_rollup is a new proc file that improves the
>> >> > performance of user programs that determine aggregate memory
>> >> > statistics (e.g., total PSS) of a process.
>> >> >
>> >> > Android regularly "samples" the memory usage of various processes in
>> >> > order to balance its memory pool sizes. This sampling process involves
>> >> > opening /proc/pid/smaps and summing certain fields. For very large
>> >> > processes, sampling memory use this way can take several hundred
>> >> > milliseconds, due mostly to the overhead of the seq_printf calls in
>> >> > task_mmu.c.
>> >
>> > Have you tried to reduce that overhead? E.g. by replacing seq_printf by
>> > something more simple
>> > http://lkml.kernel.org/r/20160817130320.gc20...@dhcp22.suse.cz?
>>
>> I haven't tried that yet, but if I'm reading that thread correctly, it
>> looks like using more efficient printing primitives gives us a 7%
>> speedup. The smaps_rollup patch gives us a much bigger speedup while
>> reusing almost all the smaps code, so it seems easier and simpler than a
>> bunch of incremental improvements to smaps. And even an efficient smaps
>> would have to push 2MB through seq_file for the 3000-VMA process case.
>
> The thing is that more users would benefit from a more efficient
> /proc/pid/smaps call. Maybe we can use some caching tricks etc...  We
> should make sure that existing options should be attempted before a new
> user visible interface is added. It is kind of sad that the real work
> (pte walk) is less expensive than formatting the output and copying it to
> the userspace...
>
>> > How often do you need to read this information?
>>
>> It varies depending on how often processes change state.  We sample a
>> short time (tens of seconds) after processes change state (e.g., enters
>> foreground) and every few minutes thereafter. We're particularly
>> concerned from an energy perspective about needlessly burning CPU on
>> background samples.
>
> Please make sure this is documented in the patch along with some numbers
> ideally.
>
> [...]
>
>> >> FYI, there was trial but got failed at that time so in this time,
>> >> https://marc.info/?l=linux-kernel&m=147310650003277&w=2
>> >> http://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1229163.html
>> >
>> > Yes I really disliked the previous attempt and this one is not all that
>> > better. The primary unanswered question back then was a relevant
>> > usecase. Back then it was argued [1] that PSS was useful for userspace
>> > OOM handling but arguments were rather dubious. Follow up questions [2]
>> > showed that the usage of PSS was very workload specific. Minchan has
>> > noted some usecase as well but not very specific either.
>>
>> Anyway, I see what you mean about PSS being iffy for user-space OOM
>> processing (because PSS doesn't tell you how much memory you get back in
>> exchange for killing a given process at a particular moment). We're not
>> using it like that.
>>
>> Instead, we're using the PSS samples we collect asynchronously for
>> system-management tasks like fine-tuning oom_adj_score, memory use
>> tracking for debugging, application-level memory-use attribution, and
>> deciding whether we want to kill large processes during system idle
>> maintenance windows. Android has been using PSS for these purposes for a
>> long time; as the average process VMA count has increased and
>> devices become more efficiency-conscious, PSS-collection inefficiency
>> has started to matter more. IMHO, it'd be a lot safer to optimize the
>> existing PSS-collection model, which has been fine-tuned over the years,
>> instead of changing the memory tracking approach entirely to work around
>> smaps-generation inefficiency.
>
> This is really vague. Please be more specific.

I actually think this is really similar to the Chrome OS use case --
we need to do proper accounting of memory from user space, and we need
something more accurate than what we have now (usually RSS) to figure
it out.  I'm not sure what is vague about that statement?

PSS is not perfect but in closed systems where we have some knowledge
about what is being shared amongst process, PSS is much better than
RSS and readily available.  So, I disagree that this is a dubious
usage -- if there's a better metric for making this kind of decision,
please share it.

Also I realized there's another argument for presenting this
information outside of smaps which is that we expose far less
information about a process and its address space via something like
this, so it's m

Re: [PATCH v4 RESEND 1/2] Documentation: tpm: add powered-while-suspended binding documentation

2017-07-05 Thread Sonny Rao

On Mon, Jul 3, 2017 at 5:57 AM, Jarkko Sakkinen
 wrote:
> On Tue, Jun 27, 2017 at 12:27:23PM +0200, Enric Balletbo i Serra wrote:
>> Add a new powered-while-suspended property to control the behavior of the
>> TPM suspend/resume.
>>
>> Signed-off-by: Enric Balletbo i Serra 
>> Signed-off-by: Sonny Rao 
>> Reviewed-by: Jason Gunthorpe 
>> Reviewed-by: Jarkko Sakkinen 
>> Acked-by: Rob Herring 
>> ---
>> Changes since v3.
>>   - Rebased on top of linux-next
>>  Rob Herring
>>   - Split DT binding from code patch as is preferred.
>>
>> Did not exist on previous versions.
>>
>>  Documentation/devicetree/bindings/security/tpm/tpm-i2c.txt | 6 ++
>>  1 file changed, 6 insertions(+)
>>
>> diff --git a/Documentation/devicetree/bindings/security/tpm/tpm-i2c.txt 
>> b/Documentation/devicetree/bindings/security/tpm/tpm-i2c.txt
>> index 8cb638b..85c8216 100644
>> --- a/Documentation/devicetree/bindings/security/tpm/tpm-i2c.txt
>> +++ b/Documentation/devicetree/bindings/security/tpm/tpm-i2c.txt
>> @@ -8,6 +8,12 @@ Required properties:
>> the firmware event log
>>  - linux,sml-size : size of the memory allocated for the firmware event log
>>
>> +Optional properties:
>> +
>> +- powered-while-suspended: present when the TPM is left powered on between
>> +   suspend and resume (makes the suspend/resume
>> +   callbacks do nothing).
>> +
>>  Example (for OpenPower Systems with Nuvoton TPM 2.0 on I2C)
>>  --
>>
>> --
>> 2.9.3
>>
>
> So... should I apply this?

Hi, since you applied the code part, it would make sense to apply the
documentation too.

>
> /Jarkko

Re: [tpmdd-devel] [PATCH] tpm: do not suspend/resume if power stays on

2017-03-01 Thread Sonny Rao

On Wed, Mar 1, 2017 at 3:18 PM, Jason Gunthorpe
 wrote:
> On Wed, Mar 01, 2017 at 02:39:09PM -0800, Sonny Rao wrote:
>
>> > We recently added global suspend/resume callbacks to the TPM
>> > core. Those call backs do not power off the TPM, they just prepare its
>> > internal state to lose power to the chip. Skipping that process on
>> > hardware that does not power-off the TPM makes sense to me.
>> >
>> > But, Sonny, perhaps this should be a global flag in tpm_chip, not a
>> > per-interface-driver override?
>>
>> It's a property of the board design not the chip -- maybe I'm
>> misunderstanding?
>
> I mean do not add the code to handle this to tpm_i2c_infineon.c but in
> the common chip code instead.
>
> tpm_i2c_infineon.c should only parse DT properties that are relevant
> to the bus that delivers commands to the TPM, things that apply to how
> a TPM chip operates should be handled in the core code because they
> apply to any command transport bus.

Oh right, sorry -- yes this makes perfect sense.

>
> Jason

Re: [tpmdd-devel] [PATCH] tpm: do not suspend/resume if power stays on

2017-03-01 Thread Sonny Rao

On Wed, Mar 1, 2017 at 10:43 AM, Jason Gunthorpe
 wrote:
>> > +Optional properties:
>> > +- powered-while-suspended: present when the TPM is left powered on between
>> > +  suspend and resume (makes the suspend/resume callbacks do nothing).
>>
>> This reads like configuration rather than a HW property.
>
> I read this to mean the HW does not cut power to the TPM when Linux
> does 'suspend'.

That's correct, it is a hardware property describing whether power is
removed during suspend.

>
> We recently added global suspend/resume callbacks to the TPM
> core. Those call backs do not power off the TPM, they just prepare its
> internal state to lose power to the chip. Skipping that process on
> hardware that does not power-off the TPM makes sense to me.
>
> But, Sonny, perhaps this should be a global flag in tpm_chip, not a
> per-interface-driver override?

It's a property of the board design not the chip -- maybe I'm misunderstanding?

>
> Jason

Re: [PATCH v5 0/3] mm, proc: Implement /proc//totmaps

2016-09-19 Thread Sonny Rao

On Mon, Sep 19, 2016 at 5:27 PM, Robert Foss  wrote:
>
>
> On 2016-09-19 03:32 PM, Michal Hocko wrote:
>>
>> On Mon 19-09-16 11:16:31, Robert Foss wrote:
>>>
>>> On 2016-09-14 05:12 AM, Michal Hocko wrote:
>>>>
>>>> On Tue 13-09-16 13:27:39, Sonny Rao wrote:
>>
>> [...]
>>>>>
>>>>> Given that smaps
>>>>> doesn't provide this in a straightforward way, what do you think is
>>>>> the right way to provide this information?
>>>>
>>>>
>>>> I would be tempted to sneak it into /proc//statm because that looks
>>>> like a proper place but getting this information is not for free
>>>> performance wise so I am not really sure something that relies on this
>>>> file would see unexpected stalls. Maybe this could be worked around by
>>>> some caching... I would suggest to check who is actually using this file
>>>> (top/ps etc...)
>>>
>>>
>>> What would this caching look like? Can any information be re-used between
>>> vma walks?
>>
>>
>> yes basically return the same value if called within HZ or something
>> similar. But that assumes that statm latency really matters and it is
>> called often enough.
>
>
> Any single application querying more often than HZ, would presumably do so
> for accuracy reasons.
> However for multiple applications that combined query more often than HZ,
> this would most definitely be helpful in terms of performance.
>
> @Sonny, does chromiumos fall into the first or second category?

It's a single application -- and it definitely doesn't query at HZ --
especially given how long it takes to gather the data :-)

Re: [PATCH v5 0/3] mm, proc: Implement /proc//totmaps

2016-09-19 Thread Sonny Rao

On Mon, Sep 19, 2016 at 12:56 PM, Jann Horn  wrote:
> On Mon, Sep 19, 2016 at 09:51:13PM +0200, Michal Hocko wrote:
>> [not sure why the CC list was trimmed - do not do that please unless you
>>  have a strong reason for that - if this was not intentional please
>>  restore it]
>
> Ah, sorry, pressed the wrong key.
>
>
>> On Mon 19-09-16 21:40:01, Jann Horn wrote:
>> > On Mon, Sep 19, 2016 at 09:32:38PM +0200, Michal Hocko wrote:
>> > > On Mon 19-09-16 11:16:31, Robert Foss wrote:
>> > > > On 2016-09-14 05:12 AM, Michal Hocko wrote:
>> > > > > On Tue 13-09-16 13:27:39, Sonny Rao wrote:
>> > > [...]
>> > > > > > Given that smaps
>> > > > > > doesn't provide this in a straightforward way, what do you think is
>> > > > > > the right way to provide this information?
>> > > > >
>> > > > > I would be tempted to sneak it into /proc//statm because that 
>> > > > > looks
>> > > > > like a proper place but getting this information is not for free
>> > > > > performance wise so I am not really sure something that relies on 
>> > > > > this
>> > > > > file would see unexpected stalls. Maybe this could be worked around 
>> > > > > by
>> > > > > some caching... I would suggest to check who is actually using this 
>> > > > > file
>> > > > > (top/ps etc...)
>> > > >
>> > > > What would this caching look like? Can any information be re-used 
>> > > > between
>> > > > vma walks?
>> > >
>> > > yes basically return the same value if called within HZ or something
>> > > similar. But that assumes that statm latency really matters and it is
>> > > called often enough.
>> >
>> > That sounds horrible. If some application decides that they want to check
>> > statm directly after some action or so (like after program startup), this 
>> > is
>> > going to give them a very bad time. That probably doesn't happen
>> > often - but still.
>> >
>> > I can already imagine some developer going "yeah, that usleep()... that's
>> > because the kernel API returns stale information for a couple milliseconds
>> > after we do something *shrug*".
>> >
>> > What are you trying to optimize for? Ten users on the same machine, each of
>> > which is running "top" because it looks so great?
>>
>> Please try to read what I wrote again. I didn't say this would be
>> needed. The idea was that _if_ /proc//statm is used very _often_
>> then some caching might help to reduce the overhead. Especially when you
>> consider that the information is not precise anyway. It can change
>> anytime while you are doing the address space walk.

Just thinking out loud here -- I haven't looked closely at the code so
please bear with me :-)

Instead of checking when the last read was and returning old data,
what about a scheme where we still have a timestamp for last stat read
on and any changes to that address space invalidate the timestamp.

The invalidation could be racy because we're not too concerned about
immediate accuracy -- so just a write.   The main issue I could see
with this is that it could cause the cacheline holding this timestamp
to bounce around a lot?  Maybe there's an existing solution in the
page table locking that could be leveraged here to at least maintain
whatever scalability enhancements are present for this type of
situation where there are many updates happening in parallel.

Re: [PATCH v5 0/3] mm, proc: Implement /proc//totmaps

2016-09-13 Thread Sonny Rao

On Tue, Sep 13, 2016 at 12:12 AM, Michal Hocko  wrote:
> On Mon 12-09-16 10:28:53, Sonny Rao wrote:
>> On Mon, Sep 12, 2016 at 10:15 AM, Michal Hocko  wrote:
>> > On Mon 12-09-16 08:31:36, Sonny Rao wrote:
> [...]
>> >> but how about the other fields like Swap, Private_Dirty and
>> >> Private_Shared?
>> >
>> > Private_Shared can be pretty confusing as well without the whole context
>> > as well see my other emails in the original thread (just to remind
>> > shmem/tmpfs makes all this really confusing).
>>
>> But this is exactly the issue -- RSS can be just as confusing if
>> you don't know something about the application.
>
> I agree that rss can be confusing but we will not make the situation any
> better if we add yet another confusing metric.
>
>> I think the issue is
>> how common that situation is, and you seem to believe that it's so
>> uncommon that it's actually better to keep the information more
>> difficult to get for those of us who know something about our systems.
>>
>> That's fine, I guess we just have to disagree here, thanks for looking at this.
>
> I think you should just step back and think more about what exactly
> you expect from the counter(s). I believe what you want is an
> estimate of a freeable memory when the particular process dies or is
> killed. That would mean resident single mapped private anonymous memory
> + unlinked single mapped shareable mappings + single mapped swapped out
> memory. Maybe I've missed something but it should be something along
> those lines. Definitely something that the current smaps infrastructure
> doesn't give you, though.

Yes your description of what we want is pretty good.  Having a
reasonable lower bound on the estimate is fine, though we probably
want to break out swapped out memory separately.  Given that smaps
doesn't provide this in a straightforward way, what do you think is
the right way to provide this information?

> --
> Michal Hocko
> SUSE Labs

Re: [PATCH v5 0/3] mm, proc: Implement /proc//totmaps

2016-09-12 Thread Sonny Rao

On Mon, Sep 12, 2016 at 10:15 AM, Michal Hocko  wrote:
> On Mon 12-09-16 08:31:36, Sonny Rao wrote:
>> On Mon, Sep 12, 2016 at 5:02 AM, Michal Hocko  wrote:
>> > On Mon 05-09-16 16:14:06, robert.f...@collabora.com wrote:
>> >> From: Robert Foss 
>> >>
>> >> This series provides the /proc/PID/totmaps feature, which
>> >> summarizes the information provided by /proc/PID/smaps for
>> >> improved performance and usability reasons.
>> >>
>> >> A use case is to speed up monitoring of memory consumption in
>> >> environments where RSS isn't precise.
>> >>
>> >> For example Chrome tends to have many processes which have hundreds of VMAs
>> >> with a substantial amount of shared memory, and the error of using
>> >> RSS rather than PSS tends to be very large when looking at overall
>> >> memory consumption.  PSS isn't kept as a single number that's exported
>> >> like RSS, so to calculate PSS means having to parse a very large smaps
>> >> file.
>> >>
>> >> This process is slow and has to be repeated for many processes, and we
>> >> found that just the act of doing the parsing was taking up a
>> >> significant amount of CPU time, so this patch is an attempt to make
>> >> that process cheaper.
>> >
>> > I still maintain my concerns about a single pss value. It might work in
>> > a very specific situations where the consumer knows what is shared but
>> > other than that the value can be more misleading than helpful. So a NACK
>> > from me until I am shown that this is usable in general and still
>> > helpful.
>>
>> I know you think Pss isn't useful in general (though I'll point out
>> two other independent people said they found it useful)
>
> sure, and one of them admitted that the value is useful because they
> _know_ the resource. The other was quite vague for me to understand
> all the details. Please, try to understand that once you provide a user
> API then it will be carved in stone. If the interface is poor and ambiguous
> it will bite us later. One very specific usecase doesn't justify
> something that might be really misleading for 90% of cases.
>
>> but how about the other fields like Swap, Private_Dirty and
>> Private_Shared?
>
> Private_Shared can be pretty confusing as well without the whole context
> as well see my other emails in the original thread (just to remind
> shmem/tmpfs makes all this really confusing).

But this is exactly the issue -- RSS can be just as confusing if
you don't know something about the application.  I think the issue is
how common that situation is, and you seem to believe that it's so
uncommon that it's actually better to keep the information more
difficult to get for those of us who know something about our systems.

That's fine, I guess we just have to disagree here, thanks for looking at this.

>
> --
> Michal Hocko
> SUSE Labs

Re: [PATCH v5 0/3] mm, proc: Implement /proc//totmaps

2016-09-12 Thread Sonny Rao

On Mon, Sep 12, 2016 at 5:02 AM, Michal Hocko  wrote:
> On Mon 05-09-16 16:14:06, robert.f...@collabora.com wrote:
>> From: Robert Foss 
>>
>> This series provides the /proc/PID/totmaps feature, which
>> summarizes the information provided by /proc/PID/smaps for
>> improved performance and usability reasons.
>>
>> A use case is to speed up monitoring of memory consumption in
>> environments where RSS isn't precise.
>>
>> For example Chrome tends to have many processes which have hundreds of VMAs
>> with a substantial amount of shared memory, and the error of using
>> RSS rather than PSS tends to be very large when looking at overall
>> memory consumption.  PSS isn't kept as a single number that's exported
>> like RSS, so to calculate PSS means having to parse a very large smaps
>> file.
>>
>> This process is slow and has to be repeated for many processes, and we
>> found that just the act of doing the parsing was taking up a
>> significant amount of CPU time, so this patch is an attempt to make
>> that process cheaper.
>
> I still maintain my concerns about a single pss value. It might work in
> a very specific situations where the consumer knows what is shared but
> other than that the value can be more misleading than helpful. So a NACK
> from me until I am shown that this is usable in general and still
> helpful.

I know you think Pss isn't useful in general (though I'll point out
two other independent people said they found it useful) but how about
the other fields like Swap, Private_Dirty and Private_Shared?

If we removed Pss would you still NACK it?

>
> --
> Michal Hocko
> SUSE Labs

Re: [PACTH v2 0/3] Implement /proc//totmaps

2016-08-22 Thread Sonny Rao

On Mon, Aug 22, 2016 at 12:54 AM, Michal Hocko  wrote:
> On Fri 19-08-16 10:57:48, Sonny Rao wrote:
>> On Fri, Aug 19, 2016 at 12:59 AM, Michal Hocko  wrote:
>> > On Thu 18-08-16 23:43:39, Sonny Rao wrote:
>> >> On Thu, Aug 18, 2016 at 11:01 AM, Michal Hocko  wrote:
>> >> > On Thu 18-08-16 10:47:57, Sonny Rao wrote:
>> >> >> On Thu, Aug 18, 2016 at 12:44 AM, Michal Hocko  
>> >> >> wrote:
>> >> >> > On Wed 17-08-16 11:57:56, Sonny Rao wrote:
>> >> > [...]
>> >> >> >> 2) User space OOM handling -- we'd rather do a more graceful 
>> >> >> >> shutdown
>> >> >> >> than let the kernel's OOM killer activate and need to gather this
>> >> >> >> information and we'd like to be able to get this information to make
>> >> >> >> the decision much faster than 400ms
>> >> >> >
>> >> >> > Global OOM handling in userspace is really dubious if you ask me. I
>> >> >> > understand you want something better than SIGKILL and in fact this is
>> >> >> > already possible with memory cgroup controller (btw. memcg will give
>> >> >> > you a cheap access to rss, amount of shared, swapped out memory as
>> >> >> > well). Anyway if you are getting close to the OOM your system will 
>> >> >> > most
>> >> >> > probably be really busy and chances are that also reading your new 
>> >> >> > file
>> >> >> > will take much more time. I am also not quite sure how is pss useful 
>> >> >> > for
>> >> >> > oom decisions.
>> >> >>
>> >> >> I mentioned it before, but based on experience RSS just isn't good
>> >> >> enough -- there's too much sharing going on in our use case to make
>> >> >> the correct decision based on RSS.  If RSS were good enough, simply
>> >> >> put, this patch wouldn't exist.
>> >> >
>> >> > But that doesn't answer my question, I am afraid. So how exactly do you
>> >> > use pss for oom decisions?
>> >>
>> >> We use PSS to calculate the memory used by a process among all the
>> >> processes in the system, in the case of Chrome this tells us how much
>> >> each renderer process (which is roughly tied to a particular "tab" in
>> >> Chrome) is using and how much it has swapped out, so we know what the
>> >> worst offenders are -- I'm not sure what's unclear about that?
>> >
>> > So let me ask more specifically. How can you make any decision based on
>> > the pss when you do not know _what_ is the shared resource. In other
>> > words if you select a task to terminate based on the pss then you have to
>> > kill others who share the same resource otherwise you do not release
>> > that shared resource. Not to mention that such a shared resource might
>> > be on tmpfs/shmem and it won't get released even after all processes
>> > which map it are gone.
>>
>> Ok I see why you're confused now, sorry.
>>
>> In our case we do know what is being shared in general because
>> the sharing is mostly between those processes that we're looking at
>> and not other random processes or tmpfs, so PSS gives us useful data
>> in the context of these processes which are sharing the data
>> especially for monitoring between the set of these renderer processes.
>
> OK, I see and agree that pss might be useful when you _know_ what is
> shared. But this sounds quite specific to a particular workload. How
> many users are in a similar situation? In other words, if we present
> a single number without the context, how much useful it will be in
> general? Is it possible that presenting such a number could be even
> misleading for somebody who doesn't have an idea which resources are
> shared? These are all questions which should be answered before we
> actually add this number (be it a new/existing proc file or a syscall).
> I still believe that the number without wider context is just not all
> that useful.


I see the specific point about  PSS -- because you need to know what
is being shared or otherwise use it in a whole system context, but I
still think the whole system context is a valid and generally useful
thing.  But what about the private_clean and private_dirty?  Surely
those are more generally useful for calculating a lower bound on
pro

Re: [PACTH v2 0/3] Implement /proc//totmaps

2016-08-19 Thread Sonny Rao

On Fri, Aug 19, 2016 at 1:05 AM, Michal Hocko  wrote:
> On Fri 19-08-16 11:26:34, Minchan Kim wrote:
>> Hi Michal,
>>
>> On Thu, Aug 18, 2016 at 08:01:04PM +0200, Michal Hocko wrote:
>> > On Thu 18-08-16 10:47:57, Sonny Rao wrote:
>> > > On Thu, Aug 18, 2016 at 12:44 AM, Michal Hocko  wrote:
>> > > > On Wed 17-08-16 11:57:56, Sonny Rao wrote:
>> > [...]
>> > > >> 2) User space OOM handling -- we'd rather do a more graceful shutdown
>> > > >> than let the kernel's OOM killer activate and need to gather this
>> > > >> information and we'd like to be able to get this information to make
>> > > >> the decision much faster than 400ms
>> > > >
>> > > > Global OOM handling in userspace is really dubious if you ask me. I
>> > > > understand you want something better than SIGKILL and in fact this is
>> > > > already possible with memory cgroup controller (btw. memcg will give
>> > > > you a cheap access to rss, amount of shared, swapped out memory as
>> > > > well). Anyway if you are getting close to the OOM your system will most
>> > > > probably be really busy and chances are that also reading your new file
>> > > > will take much more time. I am also not quite sure how is pss useful 
>> > > > for
>> > > > oom decisions.
>> > >
>> > > I mentioned it before, but based on experience RSS just isn't good
>> > > enough -- there's too much sharing going on in our use case to make
>> > > the correct decision based on RSS.  If RSS were good enough, simply
>> > > put, this patch wouldn't exist.
>> >
>> > But that doesn't answer my question, I am afraid. So how exactly do you
>> > use pss for oom decisions?
>>
>> My case is not for OOM decision but I agree it would be great if we can get
>> *fast* smap summary information.
>>
>> PSS is really great tool to figure out how processes consume memory
>> more exactly rather than RSS. We have been using it for monitoring
>> of memory for per-process. Although it is not used for OOM decision,
>> it would be great if it is speed up because we don't want to spend
>> many CPU time for just monitoring.
>>
>> For our usecase, we don't need AnonHugePages, ShmemPmdMapped, Shared_Hugetlb,
>> Private_Hugetlb, KernelPageSize, MMUPageSize because we never enable THP and
>> hugetlb. Additionally, Locked can be known via vma flags so we don't need it,
>> either. Even, we don't need address range for just monitoring when we don't
>> investigate in detail.
>>
>> Although they are not severe overhead, why does it emit the useless
>> information? Even bloat day by day. :( With that, userspace tools should
>> spend more time to parse which is pointless.
>
> So far it doesn't really seem that the parsing is the biggest problem.
> The major cycles killer is the output formatting and that doesn't sound
> like a problem we are not able to address. And I would even argue that
> we want to address it in a generic way as much as possible.
>
>> Having said that, I'm not fan of creating new stat knob for that, either.
>> How about appending summary information in the end of smap?
>> So, monitoring users can just open the file and lseek to the (end - 1) and
>> read the summary only.
>
> That might confuse existing parsers. Besides that we already have
> /proc//statm which gives cumulative numbers already. I am not sure
> how often it is used and whether the pte walk is too expensive for
> existing users but that should be explored and evaluated before a new
> file is created.
>
> The /proc became a dump of everything people found interesting just
> because we were too easy to allow those additions. Do not repeat those
> mistakes, please!

Another thing I noticed was that we lock down smaps on Chromium OS.  I
think this is to avoid exposing more information than necessary via
proc.  The totmaps file gives us just the information we need and
nothing else.   I certainly don't think we need a proc file for this
use case -- do you think a new system call is better or something
else?

> --
> Michal Hocko
> SUSE Labs

Re: [PACTH v2 0/3] Implement /proc//totmaps

2016-08-19 Thread Sonny Rao

On Fri, Aug 19, 2016 at 12:59 AM, Michal Hocko  wrote:
> On Thu 18-08-16 23:43:39, Sonny Rao wrote:
>> On Thu, Aug 18, 2016 at 11:01 AM, Michal Hocko  wrote:
>> > On Thu 18-08-16 10:47:57, Sonny Rao wrote:
>> >> On Thu, Aug 18, 2016 at 12:44 AM, Michal Hocko  wrote:
>> >> > On Wed 17-08-16 11:57:56, Sonny Rao wrote:
>> > [...]
>> >> >> 2) User space OOM handling -- we'd rather do a more graceful shutdown
>> >> >> than let the kernel's OOM killer activate and need to gather this
>> >> >> information and we'd like to be able to get this information to make
>> >> >> the decision much faster than 400ms
>> >> >
>> >> > Global OOM handling in userspace is really dubious if you ask me. I
>> >> > understand you want something better than SIGKILL and in fact this is
>> >> > already possible with memory cgroup controller (btw. memcg will give
>> >> > you a cheap access to rss, amount of shared, swapped out memory as
>> >> > well). Anyway if you are getting close to the OOM your system will most
>> >> > probably be really busy and chances are that also reading your new file
>> >> > will take much more time. I am also not quite sure how is pss useful for
>> >> > oom decisions.
>> >>
>> >> I mentioned it before, but based on experience RSS just isn't good
>> >> enough -- there's too much sharing going on in our use case to make
>> >> the correct decision based on RSS.  If RSS were good enough, simply
>> >> put, this patch wouldn't exist.
>> >
>> > But that doesn't answer my question, I am afraid. So how exactly do you
>> > use pss for oom decisions?
>>
>> We use PSS to calculate the memory used by a process among all the
>> processes in the system, in the case of Chrome this tells us how much
>> each renderer process (which is roughly tied to a particular "tab" in
>> Chrome) is using and how much it has swapped out, so we know what the
>> worst offenders are -- I'm not sure what's unclear about that?
>
> So let me ask more specifically. How can you make any decision based on
> the pss when you do not know _what_ is the shared resource. In other
> words if you select a task to terminate based on the pss then you have to
> kill others who share the same resource otherwise you do not release
> that shared resource. Not to mention that such a shared resource might
> be on tmpfs/shmem and it won't get released even after all processes
> which map it are gone.

Ok I see why you're confused now, sorry.

In our case we do know what is being shared in general because
the sharing is mostly between those processes that we're looking at
and not other random processes or tmpfs, so PSS gives us useful data
in the context of these processes which are sharing the data
especially for monitoring between the set of these renderer processes.

We also use the private clean and private dirty and swap fields to
make a few metrics for the processes and charge each process for its
private, shared, and swap data. Private clean and dirty are used for
estimating a lower bound on how much memory would be freed.  Swap and
PSS also give us some indication of additional memory which might get
freed up.

>
> I am sorry for being dense but it is still not clear to me how the
> single pss number can be used for oom or, in general, any serious
> decisions. The counter might be useful of course for debugging purposes
> or to have a general overview but then arguing about 40 vs 20ms sounds a
> bit strange to me.

Yeah so it's more than just the single PSS number: PSS,
Private_Clean, Private_Dirty, and Swap are all interesting numbers for
making these decisions.

>
>> Chrome tends to use a lot of shared memory so we found PSS to be
>> better than RSS, and I can give you examples of the  RSS and PSS on
>> real systems to illustrate the magnitude of the difference between
>> those two numbers if that would be useful.
>>
>> >
>> >> So even with memcg I think we'd have the same problem?
>> >
>> > memcg will give you instant anon, shared counters for all processes in
>> > the memcg.
>> >
>>
>> We want to be able to get per-process granularity quickly.  I'm not
>> sure if memcg provides that exactly?
>
> I will give you that information if you do process-per-memcg but that
> doesn't sound ideal. I thought those 20-something processes you were
> talking about are treated together but it seems I misunderstood.
> --
> Michal Hocko
> SUSE Labs

Re: [PACTH v2 0/3] Implement /proc//totmaps

2016-08-18 Thread Sonny Rao

On Thu, Aug 18, 2016 at 7:26 PM, Minchan Kim  wrote:
> Hi Michal,
>
> On Thu, Aug 18, 2016 at 08:01:04PM +0200, Michal Hocko wrote:
>> On Thu 18-08-16 10:47:57, Sonny Rao wrote:
>> > On Thu, Aug 18, 2016 at 12:44 AM, Michal Hocko  wrote:
>> > > On Wed 17-08-16 11:57:56, Sonny Rao wrote:
>> [...]
>> > >> 2) User space OOM handling -- we'd rather do a more graceful shutdown
>> > >> than let the kernel's OOM killer activate and need to gather this
>> > >> information and we'd like to be able to get this information to make
>> > >> the decision much faster than 400ms
>> > >
>> > > Global OOM handling in userspace is really dubious if you ask me. I
>> > > understand you want something better than SIGKILL and in fact this is
>> > > already possible with memory cgroup controller (btw. memcg will give
>> > > you a cheap access to rss, amount of shared, swapped out memory as
>> > > well). Anyway if you are getting close to the OOM your system will most
>> > > probably be really busy and chances are that also reading your new file
>> > > will take much more time. I am also not quite sure how is pss useful for
>> > > oom decisions.
>> >
>> > I mentioned it before, but based on experience RSS just isn't good
>> > enough -- there's too much sharing going on in our use case to make
>> > the correct decision based on RSS.  If RSS were good enough, simply
>> > put, this patch wouldn't exist.
>>
>> But that doesn't answer my question, I am afraid. So how exactly do you
>> use pss for oom decisions?
>
> My case is not for OOM decision but I agree it would be great if we can get
> *fast* smap summary information.
>
> PSS is a really great tool for figuring out how processes consume memory
> more exactly than RSS. We have been using it for per-process memory
> monitoring. Although it is not used for OOM decisions, it would be
> great if it were sped up because we don't want to spend much CPU
> time just for monitoring.
>
> For our usecase, we don't need AnonHugePages, ShmemPmdMapped, Shared_Hugetlb,
> Private_Hugetlb, KernelPageSize, MMUPageSize because we never enable THP and
> hugetlb. Additionally, Locked can be known via vma flags so we don't need it,
> either. Even, we don't need address range for just monitoring when we don't
> investigate in detail.
>
> Although they are not severe overhead, why does it emit the useless
> information? Even bloat day by day. :( With that, userspace tools should
> spend more time to parse which is pointless.
>
> Having said that, I'm not fan of creating new stat knob for that, either.
> How about appending summary information in the end of smap?
> So, monitoring users can just open the file and lseek to the (end - 1) and
> read the summary only.
>

That would work fine for us as long as it's fast -- i.e. we don't
still have to do all the expensive per-VMA format conversion in the
kernel.

> Thanks.

Re: [PACTH v2 0/3] Implement /proc//totmaps

2016-08-18 Thread Sonny Rao

On Thu, Aug 18, 2016 at 11:01 AM, Michal Hocko  wrote:
> On Thu 18-08-16 10:47:57, Sonny Rao wrote:
>> On Thu, Aug 18, 2016 at 12:44 AM, Michal Hocko  wrote:
>> > On Wed 17-08-16 11:57:56, Sonny Rao wrote:
> [...]
>> >> 2) User space OOM handling -- we'd rather do a more graceful shutdown
>> >> than let the kernel's OOM killer activate and need to gather this
>> >> information and we'd like to be able to get this information to make
>> >> the decision much faster than 400ms
>> >
>> > Global OOM handling in userspace is really dubious if you ask me. I
>> > understand you want something better than SIGKILL and in fact this is
>> > already possible with memory cgroup controller (btw. memcg will give
>> > you a cheap access to rss, amount of shared, swapped out memory as
>> > well). Anyway if you are getting close to the OOM your system will most
>> > probably be really busy and chances are that also reading your new file
>> > will take much more time. I am also not quite sure how is pss useful for
>> > oom decisions.
>>
>> I mentioned it before, but based on experience RSS just isn't good
>> enough -- there's too much sharing going on in our use case to make
>> the correct decision based on RSS.  If RSS were good enough, simply
>> put, this patch wouldn't exist.
>
> But that doesn't answer my question, I am afraid. So how exactly do you
> use pss for oom decisions?

We use PSS to calculate the memory used by a process among all the
processes in the system, in the case of Chrome this tells us how much
each renderer process (which is roughly tied to a particular "tab" in
Chrome) is using and how much it has swapped out, so we know what the
worst offenders are -- I'm not sure what's unclear about that?

Chrome tends to use a lot of shared memory so we found PSS to be
better than RSS, and I can give you examples of the  RSS and PSS on
real systems to illustrate the magnitude of the difference between
those two numbers if that would be useful.

>
>> So even with memcg I think we'd have the same problem?
>
> memcg will give you instant anon, shared counters for all processes in
> the memcg.
>

We want to be able to get per-process granularity quickly.  I'm not
sure if memcg provides that exactly?

>> > Don't take me wrong, /proc/<pid>/totmaps might be suitable for your
>> > specific usecase but so far I haven't heard any sound argument for it to
>> > be generally usable. It is true that smaps is unnecessarily costly but
>> > at least I can see some room for improvements. A simple patch I've
>> > posted cut the formatting overhead by 7%. Maybe we can do more.
>>
>> It seems like a general problem that if you want these values the
>> existing kernel interface can be very expensive, so it would be
>> generally usable by any application which wants a per process PSS,
>> private data, dirty data or swap value.
>
> yes this is really unfortunate. And if at all possible we should address
> that. Precise values require the expensive rmap walk. We can introduce
> some caching to help that. But so far it seems the biggest overhead is
> to simply format the output and that should be addressed before any new
> proc file is added.
>
>> I mentioned two use cases, but I guess I don't understand the comment
>> about why it's not usable by other use cases.
>
> I might be wrong here but a use of pss is quite limited and I do not
> remember anybody asking for large optimizations in that area. I still do
> not understand your use cases properly so I am quite skeptical about a
> general usefulness of a new file.

How do you know that usage of PSS is quite limited?  I can only say
that we've been using it on Chromium OS for at least four years and
have found it very valuable, and I think I've explained the use cases
in this thread. If you have more specific questions then I can try to
clarify.

>
> --
> Michal Hocko
> SUSE Labs

Re: [PACTH v2 0/3] Implement /proc//totmaps

2016-08-18 Thread Sonny Rao

On Thu, Aug 18, 2016 at 2:05 PM, Robert Foss  wrote:
>
>
> On 2016-08-18 02:01 PM, Michal Hocko wrote:
>>
>> On Thu 18-08-16 10:47:57, Sonny Rao wrote:
>>>
>>> On Thu, Aug 18, 2016 at 12:44 AM, Michal Hocko  wrote:
>>>>
>>>> On Wed 17-08-16 11:57:56, Sonny Rao wrote:
>>
>> [...]
>>>>>
>>>>> 2) User space OOM handling -- we'd rather do a more graceful shutdown
>>>>> than let the kernel's OOM killer activate and need to gather this
>>>>> information and we'd like to be able to get this information to make
>>>>> the decision much faster than 400ms
>>>>
>>>>
>>>> Global OOM handling in userspace is really dubious if you ask me. I
>>>> understand you want something better than SIGKILL and in fact this is
>>>> already possible with memory cgroup controller (btw. memcg will give
>>>> you a cheap access to rss, amount of shared, swapped out memory as
>>>> well). Anyway if you are getting close to the OOM your system will most
>>>> probably be really busy and chances are that also reading your new file
>>>> will take much more time. I am also not quite sure how is pss useful for
>>>> oom decisions.
>>>
>>>
>>> I mentioned it before, but based on experience RSS just isn't good
>>> enough -- there's too much sharing going on in our use case to make
>>> the correct decision based on RSS.  If RSS were good enough, simply
>>> put, this patch wouldn't exist.
>>
>>
>> But that doesn't answer my question, I am afraid. So how exactly do you
>> use pss for oom decisions?
>>
>>> So even with memcg I think we'd have the same problem?
>>
>>
>> memcg will give you instant anon, shared counters for all processes in
>> the memcg.
>
>
> Is it technically feasible to add instant pss support to memcg?
>
> @Sonny Rao: Would using cgroups be acceptable for chromiumos?

It's possible, though I think we'd end up putting each renderer in
its own cgroup to get the PSS stat, so it seems a bit like overkill.
I think memcg also has some overhead that we'd need to quantify but I
could be mistaken about this.

>
>
>>
>>>> Don't take me wrong, /proc/<pid>/totmaps might be suitable for your
>>>> specific usecase but so far I haven't heard any sound argument for it to
>>>> be generally usable. It is true that smaps is unnecessarily costly but
>>>> at least I can see some room for improvements. A simple patch I've
>>>> posted cut the formatting overhead by 7%. Maybe we can do more.
>>>
>>>
>>> It seems like a general problem that if you want these values the
>>> existing kernel interface can be very expensive, so it would be
>>> generally usable by any application which wants a per process PSS,
>>> private data, dirty data or swap value.
>>
>>
>> yes this is really unfortunate. And if at all possible we should address
>> that. Precise values require the expensive rmap walk. We can introduce
>> some caching to help that. But so far it seems the biggest overhead is
>> to simply format the output and that should be addressed before any new
>> proc file is added.
>>
>>> I mentioned two use cases, but I guess I don't understand the comment
>>> about why it's not usable by other use cases.
>>
>>
>> I might be wrong here but a use of pss is quite limited and I do not
>> remember anybody asking for large optimizations in that area. I still do
>> not understand your use cases properly so I am quite skeptical about a
>> general usefulness of a new file.
>>
>

Re: [PACTH v2 0/3] Implement /proc//totmaps

2016-08-18 Thread Sonny Rao

On Thu, Aug 18, 2016 at 12:44 AM, Michal Hocko  wrote:
> On Wed 17-08-16 11:57:56, Sonny Rao wrote:
>> On Wed, Aug 17, 2016 at 6:03 AM, Michal Hocko  wrote:
>> > On Wed 17-08-16 11:31:25, Jann Horn wrote:
> [...]
>> >> That's at least 30.43% + 9.12% + 7.66% = 47.21% of the task's kernel
>> >> time spent on evaluating format strings. The new interface
>> >> wouldn't have to spend that much time on format strings because there
>> >> isn't so much text to format.
>> >
>> > well, this is true of course but I would much rather try to reduce the
>> > overhead of smaps file than add a new file. The following should help
>> > already. I've measured ~7% systime cut down. I guess there is still some
>> > room for improvements but I have to say I'm far from being convinced about
>> > a new proc file just because we suck at dumping information to the
>> > userspace.
>> > If this was something like /proc/<pid>/stat which is
>> > essentially read all the time then it would be a different question but
>> > is the rss, pss going to be all that often? If yes why?
>>
>> If the question is why do we need to read RSS, PSS, Private_*, Swap
>> and the other fields so often?
>>
>> I have two use cases so far involving monitoring per-process memory
>> usage, and we usually need to read stats for about 25 processes.
>>
>> Here's a timing example on an fairly recent ARM system 4 core RK3288
>> running at 1.8Ghz
>>
>> localhost ~ # time cat /proc/25946/smaps > /dev/null
>>
>> real0m0.036s
>> user0m0.020s
>> sys 0m0.020s
>>
>> localhost ~ # time cat /proc/25946/totmaps > /dev/null
>>
>> real0m0.027s
>> user0m0.010s
>> sys 0m0.010s
>> localhost ~ #
>>
>> I'll ignore the user time for now, and we see about 20 ms of system
>> time with smaps and 10 ms with totmaps, with 20 similar processes it
>> would be 400 milliseconds of cpu time for the kernel to get this
>> information from smaps vs 200 milliseconds with totmaps.  Even totmaps
>> is still pretty slow, but much better than smaps.
>>
>> Use cases:
>> 1) Basic task monitoring -- like "top" that shows memory consumption
>> including PSS, Private, Swap
>> 1 second update means about 40% of one CPU is spent in the kernel
>> gathering the data with smaps
>
> I would argue that even 20% is way too much for such a monitoring. What
> is the value to do it so often that 20 vs 40ms really matters?

Yeah it is too much (I believe I said that) but it's significantly better.

>> 2) User space OOM handling -- we'd rather do a more graceful shutdown
>> than let the kernel's OOM killer activate and need to gather this
>> information and we'd like to be able to get this information to make
>> the decision much faster than 400ms
>
> Global OOM handling in userspace is really dubious if you ask me. I
> understand you want something better than SIGKILL and in fact this is
> already possible with memory cgroup controller (btw. memcg will give
> you a cheap access to rss, amount of shared, swapped out memory as
> well). Anyway if you are getting close to the OOM your system will most
> probably be really busy and chances are that also reading your new file
> will take much more time. I am also not quite sure how is pss useful for
> oom decisions.

I mentioned it before, but based on experience RSS just isn't good
enough -- there's too much sharing going on in our use case to make
the correct decision based on RSS.  If RSS were good enough, simply
put, this patch wouldn't exist.  So even with memcg I think we'd have
the same problem?

>
> Don't take me wrong, /proc/<pid>/totmaps might be suitable for your
> specific usecase but so far I haven't heard any sound argument for it to
> be generally usable. It is true that smaps is unnecessarily costly but
> at least I can see some room for improvements. A simple patch I've
> posted cut the formatting overhead by 7%. Maybe we can do more.

It seems like a general problem that if you want these values the
existing kernel interface can be very expensive, so it would be
generally usable by any application which wants a per process PSS,
private data, dirty data or swap value.   I mentioned two use cases,
but I guess I don't understand the comment about why it's not usable
by other use cases.

> --
> Michal Hocko
> SUSE Labs

Re: [PACTH v2 0/3] Implement /proc//totmaps

2016-08-17 Thread Sonny Rao

On Wed, Aug 17, 2016 at 6:03 AM, Michal Hocko  wrote:
> On Wed 17-08-16 11:31:25, Jann Horn wrote:
>> On Wed, Aug 17, 2016 at 10:22:00AM +0200, Michal Hocko wrote:
>> > On Tue 16-08-16 12:46:51, Robert Foss wrote:
>> > [...]
>> > > $ /usr/bin/time -v -p zsh -c "repeat 25 { awk '/^Rss/{rss+=\$2}
>> > > /^Pss/{pss+=\$2} END {printf \"rss:%d pss:%d\n\", rss, pss}\'
>> > > /proc/5025/smaps }"
>> > > [...]
>> > >   Command being timed: "zsh -c repeat 25 { awk '/^Rss/{rss+=$2}
>> > > /^Pss/{pss+=$2} END {printf "rss:%d pss:%d\n", rss, pss}\' 
>> > > /proc/5025/smaps
>> > > }"
>> > >   User time (seconds): 0.37
>> > >   System time (seconds): 0.45
>> > >   Percent of CPU this job got: 92%
>> > >   Elapsed (wall clock) time (h:mm:ss or m:ss): 0:00.89
>> >
>> > This is really unexpected. Where is the user time spent? Anyway, rather
>> > than measuring some random processes I've tried to measure something
>> > resembling the worst case. So I've created a simple program to mmap as
>> > much as possible:
>> >
>> > #include 
>> > #include 
>> > #include 
>> > #include 
>> > int main()
>> > {
>> > while (mmap(NULL, 4096, PROT_READ|PROT_WRITE, 
>> > MAP_ANON|MAP_SHARED|MAP_POPULATE, -1, 0) != MAP_FAILED)
>> > ;
>> >
>> > printf("pid:%d\n", getpid());
>> > pause();
>> > return 0;
>> > }
>>
>> Ah, nice, that's a reasonable test program. :)
>>
>>
>> > So with a reasonable user space the parsing is really not all that time
>> > consuming wrt. smaps handling. That being said I am still very skeptical
>> > about a dedicated proc file which accomplishes what userspace can do
>> > in a trivial way.
>>
>> Now, since your numbers showed that all the time is spent in the kernel,
>> also create this test program to just read that file over and over again:
>>
>> $ cat justreadloop.c
>> #include 
>> #include 
>> #include 
>> #include 
>> #include 
>> #include 
>> #include 
>>
>> char buf[100];
>>
>> int main(int argc, char **argv) {
>>   printf("pid:%d\n", getpid());
>>   while (1) {
>> int fd = open(argv[1], O_RDONLY);
>> if (fd < 0) continue;
>> if (read(fd, buf, sizeof(buf)) < 0)
>>   err(1, "read");
>> close(fd);
>>   }
>> }
>> $ gcc -Wall -o justreadloop justreadloop.c
>> $
>>
>> Now launch your test:
>>
>> $ ./mapstuff
>> pid:29397
>>
>> point justreadloop at it:
>>
>> $ ./justreadloop /proc/29397/smaps
>> pid:32567
>>
>> ... and then check the performance stats of justreadloop:
>>
>> # perf top -p 32567
>>
>> This is what I see:
>>
>> Samples: 232K of event 'cycles:ppp', Event count (approx.): 60448424325
>> Overhead  Shared Object Symbol
>>   30,43%  [kernel]  [k] format_decode
>>9,12%  [kernel]  [k] number
>>7,66%  [kernel]  [k] vsnprintf
>>7,06%  [kernel]  [k] __lock_acquire
>>3,23%  [kernel]  [k] lock_release
>>2,85%  [kernel]  [k] debug_lockdep_rcu_enabled
>>2,25%  [kernel]  [k] skip_atoi
>>2,13%  [kernel]  [k] lock_acquire
>>2,05%  [kernel]  [k] show_smap
>
> This is a lot! I would expect the rmap walk to consume more but it even
> doesn't show up in the top consumers.
>
>> That's at least 30.43% + 9.12% + 7.66% = 47.21% of the task's kernel
>> time spent on evaluating format strings. The new interface
>> wouldn't have to spend that much time on format strings because there
>> isn't so much text to format.
>
> well, this is true of course but I would much rather try to reduce the
> overhead of smaps file than add a new file. The following should help
> already. I've measured ~7% systime cut down. I guess there is still some
> room for improvements but I have to say I'm far from being convinced about
> a new proc file just because we suck at dumping information to the
> userspace.
> If this was something like /proc/<pid>/stat which is
> essentially read all the time then it would be a different question but
> is the rss, pss going to be all that often? If yes why?

If the question is why do we need to read RSS, PSS, Private_*, Swap
and the other fields so often?

I have two use cases so far involving monitoring per-process memory
usage, and we usually need to read stats for about 25 processes.

Here's a timing example on a fairly recent ARM system, a 4-core RK3288
running at 1.8GHz

localhost ~ # time cat /proc/25946/smaps > /dev/null

real    0m0.036s
user    0m0.020s
sys     0m0.020s

localhost ~ # time cat /proc/25946/totmaps > /dev/null

real    0m0.027s
user    0m0.010s
sys     0m0.010s
localhost ~ #

I'll ignore the user time for now, and we see about 20 ms of system
time with smaps and 10 ms with totmaps, with 20 similar processes it
would be 400 milliseconds of cpu time for the kernel to get this
information from smaps vs 200 milliseconds with totmaps.  Even totmaps
is still pretty slow, but much better than smaps.

Use cases:
1) Basic task monitoring -- like "top" that shows memory consumption
including PSS, Private, Swap
1 second update means about 40% of o

Re: [PACTH v1] mm, proc: Implement /proc//totmaps

2016-08-10 Thread Sonny Rao

On Wed, Aug 10, 2016 at 10:37 AM, Jann Horn  wrote:
> On Wed, Aug 10, 2016 at 10:23:53AM -0700, Sonny Rao wrote:
>> On Tue, Aug 9, 2016 at 2:01 PM, Robert Foss  
>> wrote:
>> >
>> >
>> > On 2016-08-09 03:24 PM, Jann Horn wrote:
>> >>
>> >> On Tue, Aug 09, 2016 at 12:05:43PM -0400, robert.f...@collabora.com wrote:
>> >>>
>> >>> From: Sonny Rao 
>> >>>
>> >>> This is based on earlier work by Thiago Goncales. It implements a new
>> >>> per process proc file which summarizes the contents of the smaps file
>> >>> but doesn't display any addresses.  It gives more detailed information
>> >>> than statm like the PSS (proportional set size).  It differs from the
>> >>> original implementation in that it doesn't use the full blown set of
>> >>> seq operations, uses a different termination condition, and doesn't
>> >>> display "Locked" as that was broken on the original implementation.
>> >>>
>> >>> This new proc file provides information faster than parsing the
>> >>> potentially
>> >>> huge smaps file.
>> >>>
>> >>> Signed-off-by: Sonny Rao 
>> >>>
>> >>> Tested-by: Robert Foss 
>> >>> Signed-off-by: Robert Foss 
>> >>
>> >>
>> >>
>> >>> +static int totmaps_proc_show(struct seq_file *m, void *data)
>> >>> +{
>> >>> +   struct proc_maps_private *priv = m->private;
>> >>> +   struct mm_struct *mm;
>> >>> +   struct vm_area_struct *vma;
>> >>> +   struct mem_size_stats *mss_sum = priv->mss;
>> >>> +
>> >>> +   /* reference to priv->task already taken */
>> >>> +   /* but need to get the mm here because */
>> >>> +   /* task could be in the process of exiting */
>> >>
>> >>
>> >> Can you please elaborate on this? My understanding here is that you
>> >> intend for the caller to be able to repeatedly read the same totmaps
>> >> file with pread() and still see updated information after the target
>> >> process has called execve() and be able to detect process death
>> >> (instead of simply seeing stale values). Is that accurate?
>> >>
>> >> I would prefer it if you could grab a reference to the mm_struct
>> >> directly at open time.
>> >
>> >
>> > Sonny, do you know more about the above comment?
>>
>> I think right now the file gets re-opened every time, but the mode
>> where the file is opened once and repeatedly read is interesting
>> because it avoids having to open the file again and again.
>>
>> I guess you could end up with a wierd situation where you don't read
>> the entire contents of the file in open call to read() and you might
>> get inconsistent data across the different statistics?
>
> If the file is read in two chunks, totmaps_proc_show is only called
> once. The patch specifies seq_read as read handler. Have a look at its
> definition. As long as you don't read from the same seq file in
> parallel or seek around in it, simple sequential reads will not
> re-invoke the show() method for data that has already been formatted.
> For partially consumed data, the kernel buffers the rest until someone
> reads it or seeks to another offset.

Ok that's good.  If the consumer were using pread() though, would that
look like a seek?

Re: [PACTH v1] mm, proc: Implement /proc//totmaps

2016-08-10 Thread Sonny Rao

On Tue, Aug 9, 2016 at 2:01 PM, Robert Foss  wrote:
>
>
> On 2016-08-09 03:24 PM, Jann Horn wrote:
>>
>> On Tue, Aug 09, 2016 at 12:05:43PM -0400, robert.f...@collabora.com wrote:
>>>
>>> From: Sonny Rao 
>>>
>>> This is based on earlier work by Thiago Goncales. It implements a new
>>> per process proc file which summarizes the contents of the smaps file
>>> but doesn't display any addresses.  It gives more detailed information
>>> than statm like the PSS (proportional set size).  It differs from the
>>> original implementation in that it doesn't use the full blown set of
>>> seq operations, uses a different termination condition, and doesn't
>>> display "Locked" as that was broken on the original implementation.
>>>
>>> This new proc file provides information faster than parsing the
>>> potentially
>>> huge smaps file.
>>>
>>> Signed-off-by: Sonny Rao 
>>>
>>> Tested-by: Robert Foss 
>>> Signed-off-by: Robert Foss 
>>
>>
>>
>>> +static int totmaps_proc_show(struct seq_file *m, void *data)
>>> +{
>>> +   struct proc_maps_private *priv = m->private;
>>> +   struct mm_struct *mm;
>>> +   struct vm_area_struct *vma;
>>> +   struct mem_size_stats *mss_sum = priv->mss;
>>> +
>>> +   /* reference to priv->task already taken */
>>> +   /* but need to get the mm here because */
>>> +   /* task could be in the process of exiting */
>>
>>
>> Can you please elaborate on this? My understanding here is that you
>> intend for the caller to be able to repeatedly read the same totmaps
>> file with pread() and still see updated information after the target
>> process has called execve() and be able to detect process death
>> (instead of simply seeing stale values). Is that accurate?
>>
>> I would prefer it if you could grab a reference to the mm_struct
>> directly at open time.
>
>
> Sonny, do you know more about the above comment?

I think right now the file gets re-opened every time, but the mode
where the file is opened once and repeatedly read is interesting
because it avoids having to open the file again and again.

I guess you could end up with a weird situation where you don't read
the entire contents of the file in one call to read() and you might
get inconsistent data across the different statistics?

>
>>
>>
>>> +   mm = get_task_mm(priv->task);
>>> +   if (!mm || IS_ERR(mm))
>>> +   return -EINVAL;
>>
>>
>> get_task_mm() doesn't return error codes, and all other callers just
>> check whether the return value is NULL.
>>
>
> I'll have that fixed in v2, thanks for spotting it!
>
>
>>
>>> +   down_read(&mm->mmap_sem);
>>> +   hold_task_mempolicy(priv);
>>> +
>>> +   for (vma = mm->mmap; vma != priv->tail_vma; vma = vma->vm_next) {
>>> +   struct mem_size_stats mss;
>>> +   struct mm_walk smaps_walk = {
>>> +   .pmd_entry = smaps_pte_range,
>>> +   .mm = vma->vm_mm,
>>> +   .private = &mss,
>>> +   };
>>> +
>>> +   if (vma->vm_mm && !is_vm_hugetlb_page(vma)) {
>>> +   memset(&mss, 0, sizeof(mss));
>>> +   walk_page_vma(vma, &smaps_walk);
>>> +   add_smaps_sum(&mss, mss_sum);
>>> +   }
>>> +   }
>>
>>
>> E... what? You accumulate values from mem_size_stats items into a
>> struct mss_sum that is associated with the struct file? So when you
>> read the file the second time, you get the old values plus the new ones?
>> And when you read the file in parallel, you get inconsistent values?
>>
>> For most files in procfs, the behavior is that you can just call
>> pread(fd, buf, sizeof(buf), 0) on the same fd again and again, giving
>> you the current values every time, without mutating state. I strongly
>> recommend that you get rid of priv->mss and just accumulate the state
>> in a local variable (maybe one on the stack).
>
>
> So a simple "static struct mem_size_stats" in totmaps_proc_show() would be a
> better solution?
>
>>
>>
>>> @@ -836,6 +911,50 @@ static int tid_smaps_open(struct inode *inode,
>>> struct file *file)
>>>

Re: [PACTH v1] mm, proc: Implement /proc//totmaps

2016-08-09 Thread Sonny Rao

On Tue, Aug 9, 2016 at 12:16 PM, Konstantin Khlebnikov  wrote:
>
> On Tue, Aug 9, 2016 at 7:05 PM,   wrote:
> > From: Sonny Rao 
> >
> > This is based on earlier work by Thiago Goncales. It implements a new
> > per process proc file which summarizes the contents of the smaps file
> > but doesn't display any addresses.  It gives more detailed information
> > than statm like the PSS (proportional set size).  It differs from the
> > original implementation in that it doesn't use the full blown set of
> > seq operations, uses a different termination condition, and doesn't
> > display "Locked" as that was broken on the original implementation.
> >
> > This new proc file provides information faster than parsing the potentially
> > huge smaps file.
>
> What statistics do you really need?

PSS (Proportional Set Size) and related accounting of shared pages
(swap could be shared) is where the existing summaries of memory usage
are cumbersome.

>
>
> I think, performance and flexibility issues could be really solved only by new
> syscall for querying memory statistics for address range in any process:
> process_vm_stat() or some kind of pumped fincore() for /proc/$pid/mem


That would be a good long term solution if people want similarly
complicated statistics without having to iterate through current
interfaces.
I mentioned monitoring before, but I'll add that per-process Proportional
Set Size, Unique Set Size, and Swap are also useful because they
help us make better decisions about which processes need to be
throttled or gracefully killed.

>
> >
> > Signed-off-by: Sonny Rao 
> >
> > Tested-by: Robert Foss 
> > Signed-off-by: Robert Foss 
> >
> > ---
> >  fs/proc/base.c |   1 +
> >  fs/proc/internal.h |   4 ++
> >  fs/proc/task_mmu.c | 126 
> > +
> >  3 files changed, 131 insertions(+)
> >
> > diff --git a/fs/proc/base.c b/fs/proc/base.c
> > index a11eb71..de3acdf 100644
> > --- a/fs/proc/base.c
> > +++ b/fs/proc/base.c
> > @@ -2855,6 +2855,7 @@ static const struct pid_entry tgid_base_stuff[] = {
> > REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
> > REG("smaps",  S_IRUGO, proc_pid_smaps_operations),
> > REG("pagemap",S_IRUSR, proc_pagemap_operations),
> > +   REG("totmaps",S_IRUGO, proc_totmaps_operations),
> >  #endif
> >  #ifdef CONFIG_SECURITY
> > DIR("attr",   S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, 
> > proc_attr_dir_operations),
> > diff --git a/fs/proc/internal.h b/fs/proc/internal.h
> > index aa27810..6f3540f 100644
> > --- a/fs/proc/internal.h
> > +++ b/fs/proc/internal.h
> > @@ -58,6 +58,9 @@ union proc_op {
> > struct task_struct *task);
> >  };
> >
> > +
> > +extern const struct file_operations proc_totmaps_operations;
> > +
> >  struct proc_inode {
> > struct pid *pid;
> > int fd;
> > @@ -281,6 +284,7 @@ struct proc_maps_private {
> > struct mm_struct *mm;
> >  #ifdef CONFIG_MMU
> > struct vm_area_struct *tail_vma;
> > +   struct mem_size_stats *mss;
> >  #endif
> >  #ifdef CONFIG_NUMA
> > struct mempolicy *task_mempolicy;
> > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> > index 4648c7f..b61873e 100644
> > --- a/fs/proc/task_mmu.c
> > +++ b/fs/proc/task_mmu.c
> > @@ -802,6 +802,81 @@ static int show_smap(struct seq_file *m, void *v, int 
> > is_pid)
> > return 0;
> >  }
> >
> > +static void add_smaps_sum(struct mem_size_stats *mss,
> > +   struct mem_size_stats *mss_sum)
> > +{
> > +   mss_sum->resident += mss->resident;
> > +   mss_sum->pss += mss->pss;
> > +   mss_sum->shared_clean += mss->shared_clean;
> > +   mss_sum->shared_dirty += mss->shared_dirty;
> > +   mss_sum->private_clean += mss->private_clean;
> > +   mss_sum->private_dirty += mss->private_dirty;
> > +   mss_sum->referenced += mss->referenced;
> > +   mss_sum->anonymous += mss->anonymous;
> > +   mss_sum->anonymous_thp += mss->anonymous_thp;
> > +   mss_sum->swap += mss->swap;
> > +}
> > +
> > +static int totmaps_proc_show(struct seq_file *m, void *data)
> > +{
> > +   struct proc_maps_private *priv = m->private;
> > +   struct mm_struct *mm;
> > +   struct vm_area_struct *

Re: [PACTH v1] mm, proc: Implement /proc/$pid/totmaps

2016-08-09 Thread Sonny Rao

On Tue, Aug 9, 2016 at 9:58 AM, Alexey Dobriyan  wrote:
>
> On Tue, Aug 09, 2016 at 12:05:43PM -0400, robert.f...@collabora.com wrote:
> > From: Sonny Rao 
> >
> > This is based on earlier work by Thiago Goncales. It implements a new
> > per process proc file which summarizes the contents of the smaps file
> > but doesn't display any addresses.  It gives more detailed information
> > than statm like the PSS (proprotional set size).  It differs from the
> > original implementation in that it doesn't use the full blown set of
> > seq operations, uses a different termination condition, and doesn't
> > displayed "Locked" as that was broken on the original implemenation.
> >
> > This new proc file provides information faster than parsing the potentially
> > huge smaps file.
>
> You can "parse" /proc/*/pagemap . RSS, swap are there.


/proc/*/pagemap is generally restricted and I don't believe it would
quickly give PSS.

>
> So which ones do you really need?

PSS and Swap are the most important.  RSS isn't precise enough because
it counts shared pages fully, and there tends to be a lot of sharing.

> Why the separate anon hugepages and anon regular pages?

I'm not sure if it's necessary, but that's how it's broken out in smaps.

>
> > + seq_printf(m,
> > +"Rss:%8lu kB\n"
> > +"Pss:%8lu kB\n"
> > +"Shared_Clean:   %8lu kB\n"
> > +"Shared_Dirty:   %8lu kB\n"
> > +"Private_Clean:  %8lu kB\n"
> > +"Private_Dirty:  %8lu kB\n"
> > +"Referenced: %8lu kB\n"
> > +"Anonymous:  %8lu kB\n"
> > +"AnonHugePages:  %8lu kB\n"
> > +"Swap:   %8lu kB\n",
> > +mss_sum->resident >> 10,
> > +(unsigned long)(mss_sum->pss >> (10 + PSS_SHIFT)),
> > +mss_sum->shared_clean  >> 10,
> > +mss_sum->shared_dirty  >> 10,
> > +mss_sum->private_clean >> 10,
> > +mss_sum->private_dirty >> 10,
> > +mss_sum->referenced >> 10,
> > +mss_sum->anonymous >> 10,
> > +mss_sum->anonymous_thp >> 10,
> > +mss_sum->swap >> 10);

Re: [PACTH v1] mm, proc: Implement /proc/$pid/totmaps

2016-08-09 Thread Sonny Rao

On Tue, Aug 9, 2016 at 9:29 AM, Mateusz Guzik  wrote:
> On Tue, Aug 09, 2016 at 12:05:43PM -0400, robert.f...@collabora.com wrote:
>> From: Sonny Rao 
>>
>> This is based on earlier work by Thiago Goncales. It implements a new
>> per process proc file which summarizes the contents of the smaps file
>> but doesn't display any addresses.  It gives more detailed information
>> than statm like the PSS (proprotional set size).  It differs from the
>> original implementation in that it doesn't use the full blown set of
>> seq operations, uses a different termination condition, and doesn't
>> displayed "Locked" as that was broken on the original implemenation.
>>
>> This new proc file provides information faster than parsing the potentially
>> huge smaps file.
>
> I have no idea about usefulness of this.

I can comment about this.  The use case is to speed up monitoring of
memory consumption in environments where RSS isn't precise.

For example, Chrome tends to have many processes which have hundreds of VMAs
with a substantial amount of shared memory, and the error of using
RSS rather than PSS tends to be very large when looking at overall
memory consumption.  PSS isn't kept as a single number that's exported
like RSS, so to calculate PSS means having to parse a very large smaps
file.

This process is slow and has to be repeated for many processes, and we
found that just the act of doing the parsing was taking up a
significant amount of CPU time, so this patch is an attempt to make
that process cheaper.

>
> The patch is definitely buggy with respect to how it implements actual
> access to mm.
>
>> +static int totmaps_proc_show(struct seq_file *m, void *data)
>> +{
>> + struct proc_maps_private *priv = m->private;
>> + struct mm_struct *mm;
>> + struct vm_area_struct *vma;
>> + struct mem_size_stats *mss_sum = priv->mss;
>> +
>> + /* reference to priv->task already taken */
>> + /* but need to get the mm here because */
>> + /* task could be in the process of exiting */
>> + mm = get_task_mm(priv->task);
>> + if (!mm || IS_ERR(mm))
>> + return -EINVAL;
>> +
>
> That's not how it's done in smaps.
>
>> +static int totmaps_open(struct inode *inode, struct file *file)
>> +{
>> + struct proc_maps_private *priv;
>> + int ret = -ENOMEM;
>> + priv = kzalloc(sizeof(*priv), GFP_KERNEL);
>> + if (priv) {
>> + priv->mss = kzalloc(sizeof(*priv->mss), GFP_KERNEL);
>> + if (!priv->mss)
>> + return -ENOMEM;
>
> Cases below explicitly kfree(priv). I can't remember whether the close
> routine gets called if this one fails. Either way, something is wrong
> here.
>
>> +
>> + /* we need to grab references to the task_struct */
>> + /* at open time, because there's a potential information */
>> + /* leak where the totmaps file is opened and held open */
>> + /* while the underlying pid to task mapping changes */
>> + /* underneath it */
>> + priv->task = get_pid_task(proc_pid(inode), PIDTYPE_PID);
>
> This performs no permission checks that I would see. If you take a look
> at smaps you will see the user ends up in proc_maps_open which performs
> proc_mem_open(inode, PTRACE_MODE_READ) and gets a mm from there.
>
>
>> + if (!priv->task) {
>> + kfree(priv->mss);
>> + kfree(priv);
>> + return -ESRCH;
>> + }
>> +
>> + ret = single_open(file, totmaps_proc_show, priv);
>> + if (ret) {
>> + put_task_struct(priv->task);
>> + kfree(priv->mss);
>> + kfree(priv);
>> + }
>> + }
>> + return ret;
>> +}
>> +
>
> --
> Mateusz Guzik

Re: [PATCH v2 3/5] Documentation: arm-pl330: add description of arm,pl330-broken-no-flushp

2015-08-31 Thread Sonny Rao

On Thu, Aug 27, 2015 at 5:38 PM, Shawn Lin  wrote:
> Signed-off-by: Shawn Lin 
> Reviewed-by: Doug Anderson 
> ---
>
> Changes in v2:
> - add Reviewed-by: Doug Anderson 
> - reorder this patch before the usage of the quirk
>
> Changes in v1:
> - rename broken-no-flushp to "arm,pl330-broken-no-flushp" suggested
>   by Krzysztof.
>
>  Documentation/devicetree/bindings/dma/arm-pl330.txt | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/Documentation/devicetree/bindings/dma/arm-pl330.txt 
> b/Documentation/devicetree/bindings/dma/arm-pl330.txt
> index 2675658..db7e226 100644
> --- a/Documentation/devicetree/bindings/dma/arm-pl330.txt
> +++ b/Documentation/devicetree/bindings/dma/arm-pl330.txt
> @@ -15,6 +15,7 @@ Optional properties:
>  cells in the dmas property of client device.
>- dma-channels: contains the total number of DMA channels supported by the 
> DMAC
>- dma-requests: contains the total number of DMA requests supported by the 
> DMAC
> +  - arm,pl330-broken-no-flushp: quirk for avoiding to execute DMAFLUSHP
>
>  Example:
>
> --
> 2.3.7

Reviewed-by: Sonny Rao 

>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v2 1/5] DMA: pl330: support burst mode for dev-to-mem and mem-to-dev transmit

2015-08-31 Thread Sonny Rao

On Thu, Aug 27, 2015 at 5:37 PM, Shawn Lin  wrote:
> From: Boojin Kim 
>
> This patch adds to support burst mode for dev-to-mem and
> mem-to-dev transmit.
>
> Signed-off-by: Boojin Kim 
> Signed-off-by: Addy Ke 
> Signed-off-by: Shawn Lin 
> cc: Heiko Stuebner 
> cc: Doug Anderson 
> cc: Olof Johansson 
>
> ---
>
> Changes in v2:
> - amend the author
> - amend Olof's mail address
>
> Changes in v1:
> - rename broken-no-flushp to "arm,pl330-broken-no-flushp" suggested
>   by Krzysztof.
> - add From original author.
> - remove Sunny's tag
>
>  drivers/dma/pl330.c | 18 --
>  1 file changed, 12 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c
> index ecab4ea0..0d544d2 100644
> --- a/drivers/dma/pl330.c
> +++ b/drivers/dma/pl330.c
> @@ -1141,10 +1141,13 @@ static inline int _ldst_devtomem(unsigned dry_run, u8 
> buf[],
> const struct _xfer_spec *pxs, int cyc)
>  {
> int off = 0;
> +   enum pl330_cond cond;
> +
> +   cond = (pxs->desc->rqcfg.brst_len == 1) ? SINGLE : BURST;
>
> while (cyc--) {
> -   off += _emit_WFP(dry_run, &buf[off], SINGLE, pxs->desc->peri);
> -   off += _emit_LDP(dry_run, &buf[off], SINGLE, pxs->desc->peri);
> +   off += _emit_WFP(dry_run, &buf[off], cond, pxs->desc->peri);
> +   off += _emit_LDP(dry_run, &buf[off], cond, pxs->desc->peri);
> off += _emit_ST(dry_run, &buf[off], ALWAYS);
> off += _emit_FLUSHP(dry_run, &buf[off], pxs->desc->peri);
> }
> @@ -1156,11 +1159,14 @@ static inline int _ldst_memtodev(unsigned dry_run, u8 
> buf[],
> const struct _xfer_spec *pxs, int cyc)
>  {
> int off = 0;
> +   enum pl330_cond cond;
> +
> +   cond = (pxs->desc->rqcfg.brst_len == 1) ? SINGLE : BURST;
>
> while (cyc--) {
> -   off += _emit_WFP(dry_run, &buf[off], SINGLE, pxs->desc->peri);
> +   off += _emit_WFP(dry_run, &buf[off], cond, pxs->desc->peri);
> off += _emit_LD(dry_run, &buf[off], ALWAYS);
> -   off += _emit_STP(dry_run, &buf[off], SINGLE, pxs->desc->peri);
> +   off += _emit_STP(dry_run, &buf[off], cond, pxs->desc->peri);
> off += _emit_FLUSHP(dry_run, &buf[off], pxs->desc->peri);
> }
>
> @@ -2557,7 +2563,7 @@ static struct dma_async_tx_descriptor 
> *pl330_prep_dma_cyclic(
>
> desc->rqtype = direction;
> desc->rqcfg.brst_size = pch->burst_sz;
> -   desc->rqcfg.brst_len = 1;
> +   desc->rqcfg.brst_len = pch->burst_len;
> desc->bytes_requested = period_len;
> fill_px(&desc->px, dst, src, period_len);
>
> @@ -2702,7 +2708,7 @@ pl330_prep_slave_sg(struct dma_chan *chan, struct 
> scatterlist *sgl,
> }
>
> desc->rqcfg.brst_size = pch->burst_sz;
> -   desc->rqcfg.brst_len = 1;
> +   desc->rqcfg.brst_len = pch->burst_len;
> desc->rqtype = direction;
> desc->bytes_requested = sg_dma_len(sg);
> }
> --
> 2.3.7

Reviewed-by: Sonny Rao 

>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v2 2/5] DMA: pl330: add quirk for broken no flushp

2015-08-31 Thread Sonny Rao

off += _ldst_memtodev(dry_run, &buf[off], pxs, cyc);
> +   off += _ldst_memtodev(pl330, dry_run, &buf[off], pxs, cyc);
> break;
> case DMA_DEV_TO_MEM:
> -   off += _ldst_devtomem(dry_run, &buf[off], pxs, cyc);
> +   off += _ldst_devtomem(pl330, dry_run, &buf[off], pxs, cyc);
> break;
> case DMA_MEM_TO_MEM:
> off += _ldst_memtomem(dry_run, &buf[off], pxs, cyc);
> @@ -1197,7 +1225,7 @@ static int _bursts(unsigned dry_run, u8 buf[],
>  }
>
>  /* Returns bytes consumed and updates bursts */
> -static inline int _loop(unsigned dry_run, u8 buf[],
> +static inline int _loop(struct pl330_dmac *pl330, unsigned dry_run, u8 buf[],
> unsigned long *bursts, const struct _xfer_spec *pxs)
>  {
> int cyc, cycmax, szlp, szlpend, szbrst, off;
> @@ -1220,7 +1248,7 @@ static inline int _loop(unsigned dry_run, u8 buf[],
> }
>
> szlp = _emit_LP(1, buf, 0, 0);
> -   szbrst = _bursts(1, buf, pxs, 1);
> +   szbrst = _bursts(pl330, 1, buf, pxs, 1);
>
> lpend.cond = ALWAYS;
> lpend.forever = false;
> @@ -1252,7 +1280,7 @@ static inline int _loop(unsigned dry_run, u8 buf[],
> off += _emit_LP(dry_run, &buf[off], 1, lcnt1);
> ljmp1 = off;
>
> -   off += _bursts(dry_run, &buf[off], pxs, cyc);
> +   off += _bursts(pl330, dry_run, &buf[off], pxs, cyc);
>
> lpend.cond = ALWAYS;
> lpend.forever = false;
> @@ -1275,8 +1303,9 @@ static inline int _loop(unsigned dry_run, u8 buf[],
> return off;
>  }
>
> -static inline int _setup_loops(unsigned dry_run, u8 buf[],
> -   const struct _xfer_spec *pxs)
> +static inline int _setup_loops(struct pl330_dmac *pl330,
> +  unsigned dry_run, u8 buf[],
> +  const struct _xfer_spec *pxs)
>  {
> struct pl330_xfer *x = &pxs->desc->px;
> u32 ccr = pxs->ccr;
> @@ -1285,15 +1314,16 @@ static inline int _setup_loops(unsigned dry_run, u8 
> buf[],
>
> while (bursts) {
> c = bursts;
> -   off += _loop(dry_run, &buf[off], &c, pxs);
> +   off += _loop(pl330, dry_run, &buf[off], &c, pxs);
> bursts -= c;
> }
>
> return off;
>  }
>
> -static inline int _setup_xfer(unsigned dry_run, u8 buf[],
> -   const struct _xfer_spec *pxs)
> +static inline int _setup_xfer(struct pl330_dmac *pl330,
> + unsigned dry_run, u8 buf[],
> + const struct _xfer_spec *pxs)
>  {
> struct pl330_xfer *x = &pxs->desc->px;
> int off = 0;
> @@ -1304,7 +1334,7 @@ static inline int _setup_xfer(unsigned dry_run, u8 
> buf[],
> off += _emit_MOV(dry_run, &buf[off], DAR, x->dst_addr);
>
> /* Setup Loop(s) */
> -   off += _setup_loops(dry_run, &buf[off], pxs);
> +   off += _setup_loops(pl330, dry_run, &buf[off], pxs);
>
> return off;
>  }
> @@ -1313,8 +1343,9 @@ static inline int _setup_xfer(unsigned dry_run, u8 
> buf[],
>   * A req is a sequence of one or more xfer units.
>   * Returns the number of bytes taken to setup the MC for the req.
>   */
> -static int _setup_req(unsigned dry_run, struct pl330_thread *thrd,
> -   unsigned index, struct _xfer_spec *pxs)
> +static int _setup_req(struct pl330_dmac *pl330, unsigned dry_run,
> + struct pl330_thread *thrd, unsigned index,
> + struct _xfer_spec *pxs)
>  {
> struct _pl330_req *req = &thrd->req[index];
> struct pl330_xfer *x;
> @@ -1331,7 +1362,7 @@ static int _setup_req(unsigned dry_run, struct 
> pl330_thread *thrd,
> if (x->bytes % (BRST_SIZE(pxs->ccr) * BRST_LEN(pxs->ccr)))
> return -EINVAL;
>
> -   off += _setup_xfer(dry_run, &buf[off], pxs);
> +   off += _setup_xfer(pl330, dry_run, &buf[off], pxs);
>
> /* DMASEV peripheral/event */
> off += _emit_SEV(dry_run, &buf[off], thrd->ev);
> @@ -1425,7 +1456,7 @@ static int pl330_submit_req(struct pl330_thread *thrd,
> xs.desc = desc;
>
> /* First dry run to check if req is acceptable */
> -   ret = _setup_req(1, thrd, idx, &xs);
> +   ret = _setup_req(pl330, 1, thrd, idx, &xs);
> if (ret < 0)
> goto xfer_exit;
>
> @@ -1439,7 +1470,7 @@ static int pl330_submit_req(struct pl330_thread *thrd,
> /* Hook the request */
> thrd->lstenq = idx;
&

Re: [PATCH v2 0/5] Fix broken DMAFLUSHP on Rockchips platform

2015-08-31 Thread Sonny Rao

On Thu, Aug 27, 2015 at 5:36 PM, Shawn Lin  wrote:
>
> The purpose of the DMAFLUSHP instruction:
> - Tell the peripheral to clear its status and control registers.
> - Send a message to the peripheral to resend its level status.
>
> There are 3 timings described in PL330 Technical Reference Manual:
> - Timing 1: Burst request, can work well without DMAFLUSHP.
> - Timing 2: Single and burst request, DMAC will ignore the single
> transfer request. This timing happens if there are single
> and burst request.
> - Timing 3: Single transfers for a burst request, DMAC should signals
> datype to request the peripheral to flush the contents of
> any control registers. This timing happens if there is
> not enough MFIFO to places the burst data.
>
> A peripheral may signal a DMA request during the execution of
> DMAFLUSHP instruction, that cause DMA request being ignored by DMAC.
>
> But DMAC and all peripherals on RK3X SoCs DO NOT support DMAFLUSHP.
> It can't send a message to the peripheral to resend DMA request,
> and the peripheral can't acknowledge a flush request from DMAC.
> So all DMA requests should NOT be ignored by DMAC, and DMAC will not
> notify the peripheral to flush.
>
> To fix this problem, we need:
> - Do NOT execute DMAFLUSHP instruction.
> - Timing 2 and timing 3 should not happen.
>
> Because on RK3X SoCs, there are 6 or below  channels and 32 MFIFO depth
> for DMAC_BUS, and 8 channels and 64 MFIFO depth for DMAC_PERI, it is
> impossible to hit the timing 3 if burst length is equal or less than 4.

Fixing this issue also requires changes to drivers, so it would be
nice if you put those changes into the same patchset.
Otherwise someone may apply this series and expect things to work but
they will still be broken. Specifically the peripherals should be
setting their burst sizes for their DMA requests low enough to avoid
needing the working DMAFLUSHP instruction.

Also, I remember we ran into an issue when we tried using burst length
of 4 with the i2s device on RK3288 because we could get requests that
either weren't aligned or weren't a multiple of 4 in size, and some transfers
would just fail, so we ended up using a burst size of 1.  I recommend
if we aren't sure about size or alignment for a particular peripheral,
a burst size of 1 is safest.  For something like a block device, I
think we can use the larger size bursts.  That's another reason to
include the driver fixes in the series, just so we get it right,
thanks.

>
> Since the request type signal by the peripheral can only be set by
> software. We can set Rockchip Soc's GRF_PERIDMAC_CON0[2:1] to select single
> or burst request, if it is set b01, all of the peripherals will signal a
> burst request. So the timing 2 will not happen, too.
>
> So DMAC on RK3X can support single or burst transfer, but can't support
> mixed transfer.
>
> Because burst transfer is more efficient than single transfer, this is
> confirmed by our ASIC team, who strongly suggest to use burst transfer.
> And this is confirmed by Addy's test on RK3288-Pink2 board, the speed of
> spi flash burst transfer will increase about two times than single transfer.
> Also, I have tested dw_mmc with pl330 on RK3188 platform to double confirm
> the result. That means burst transfer is reansonable.
>
> So we need a quirk not to execute DMAFLUSHP instruction and to use burst
> transfer.
>
> Note:
> - The Rockchip Soc default value of GRF_PERIDMAC_CON0[2:1] is b01. To
>   support brust transfer, these bits should not be changed in bootloader.
>
>
> Changes in v2:
> - amend the author
> - reorder the patches suggested by Doug
> - add Reviewed-by: Doug Anderson  for
>   rk3288.dtsi patch and arm-pl330.txt patch
> - amend Olof's mail address
>
> Changes in v1:
> - rename broken-no-flushp to "arm,pl330-broken-no-flushp" suggested
>   by Krzysztof.
> - add From original author.
> - remove Sunny's tag
>
> Addy Ke (2):
>   DMA: pl330: add quirk for broken no flushp
>   ARM: dts: Add arm,pl330-broken-no-flushp quirk for rk3288 platform
>
> Boojin Kim (1):
>   DMA: pl330: support burst mode for dev-to-mem and mem-to-dev transmit
>
> Shawn Lin (2):
>   Documentation: arm-pl330: add description of
> arm,pl330-broken-no-flushp
>   ARM: dts: Add arm,pl330-broken-no-flushp quirk for rk3xxx platform
>
>  .../devicetree/bindings/dma/arm-pl330.txt  |   1 +
>  arch/arm/boot/dts/rk3288.dtsi  |   3 +
>  arch/arm/boot/dts/rk3xxx.dtsi  |   3 +
>  drivers/dma/pl330.c| 101 
> +++--
>  4 files changed, 79 insertions(+), 29 deletions(-)
>
> --
> 2.3.7
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubs

Re: [PATCH 1/5] DMA: pl330: support burst mode for dev-to-mem and mem-to-dev transmit

2015-08-28 Thread Sonny Rao

On Thu, Aug 27, 2015 at 6:28 AM, Shawn Lin  wrote:
> 在 2015/8/27 20:57, Krzysztof Kozlowski 写道:
>>
>> 2015-08-27 17:48 GMT+09:00 Shawn Lin :
>>>
>>>
>>> This patch adds to support burst mode for dev-to-mem and
>>> mem-to-dev transmit.
>>>
>>> Signed-off-by: Boojin Kim 
>>> Signed-off-by: Addy Ke 
>>> Signed-off-by: Shawn Lin 
>>> cc: Heiko Stuebner 
>>> cc: Doug Anderson 
>>> cc: Olof Johansson 
>>> Reviewed-and-tested-by: Sonny Rao 
>>
>>
>> For the entire patchset: I would prefer to see someone's
>> reviewed/tested tag in his response. Sending a version 1 of patchset
>> (regardless of Boojin Kim's work two years ago) with such tag could
>> mean anything. I cannot verify it easily (unless digging somewhere...
>> or asking people). You could add for example: Reviewed-by Santa Claus.
>> Should I sent a letter to him asking for confirmation? :)
>>
>
> :) yes, you are right. I should comply with the rule, even if the patchest
> had been reviewed or tested by someone on another tree.

Hi, yeah I reviewed on a different tree, so you shouldn't put that tag
here, thanks for removing it.
I can re-review if you'd like.

>
>
>> More seriously - reviewed-by is a statement (please look at
>> Documentation/SubmittingPatches) and you cannot force someone to make
>> that statement. He must make such statement on his own.
>>
>> That's all from my side since I don't feel skilled enough to review the
>> code.
>>
>> Best regards,
>> Krzysztof
>>
>>>
>>> ---
>>>
>>>   drivers/dma/pl330.c | 18 --
>>>   1 file changed, 12 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c
>>> index ecab4ea0..0d544d2 100644
>>> --- a/drivers/dma/pl330.c
>>> +++ b/drivers/dma/pl330.c
>>> @@ -1141,10 +1141,13 @@ static inline int _ldst_devtomem(unsigned
>>> dry_run, u8 buf[],
>>>  const struct _xfer_spec *pxs, int cyc)
>>>   {
>>>  int off = 0;
>>> +   enum pl330_cond cond;
>>> +
>>> +   cond = (pxs->desc->rqcfg.brst_len == 1) ? SINGLE : BURST;
>>>
>>>  while (cyc--) {
>>> -   off += _emit_WFP(dry_run, &buf[off], SINGLE,
>>> pxs->desc->peri);
>>> -   off += _emit_LDP(dry_run, &buf[off], SINGLE,
>>> pxs->desc->peri);
>>> +   off += _emit_WFP(dry_run, &buf[off], cond,
>>> pxs->desc->peri);
>>> +   off += _emit_LDP(dry_run, &buf[off], cond,
>>> pxs->desc->peri);
>>>  off += _emit_ST(dry_run, &buf[off], ALWAYS);
>>>  off += _emit_FLUSHP(dry_run, &buf[off],
>>> pxs->desc->peri);
>>>  }
>>> @@ -1156,11 +1159,14 @@ static inline int _ldst_memtodev(unsigned
>>> dry_run, u8 buf[],
>>>  const struct _xfer_spec *pxs, int cyc)
>>>   {
>>>  int off = 0;
>>> +   enum pl330_cond cond;
>>> +
>>> +   cond = (pxs->desc->rqcfg.brst_len == 1) ? SINGLE : BURST;
>>>
>>>  while (cyc--) {
>>> -   off += _emit_WFP(dry_run, &buf[off], SINGLE,
>>> pxs->desc->peri);
>>> +   off += _emit_WFP(dry_run, &buf[off], cond,
>>> pxs->desc->peri);
>>>  off += _emit_LD(dry_run, &buf[off], ALWAYS);
>>> -   off += _emit_STP(dry_run, &buf[off], SINGLE,
>>> pxs->desc->peri);
>>> +   off += _emit_STP(dry_run, &buf[off], cond,
>>> pxs->desc->peri);
>>>  off += _emit_FLUSHP(dry_run, &buf[off],
>>> pxs->desc->peri);
>>>  }
>>>
>>> @@ -2557,7 +2563,7 @@ static struct dma_async_tx_descriptor
>>> *pl330_prep_dma_cyclic(
>>>
>>>  desc->rqtype = direction;
>>>  desc->rqcfg.brst_size = pch->burst_sz;
>>> -   desc->rqcfg.brst_len = 1;
>>> +   desc->rqcfg.brst_len = pch->burst_len;
>>>  desc->bytes_requested = period_len;
>>>  fill_px(&desc->px, dst, src, period_len);
>>>
>>> @@ -2702,7 +2708,7 @@ pl330_prep_slave_sg(struct dma_chan *chan, struct
>>> scatterlist *sgl,
>>>

Re: [PATCH] perf/x86/intel/uncore: add Broadwell-U uncore IMC PMU support

2015-04-23 Thread Sonny Rao

On Wed, Apr 22, 2015 at 11:56 PM, Stephane Eranian  wrote:
>
> This patch enables the uncore Memory Controller (IMC) PMU support
> for Intel Broadwell-U (Model 61) mobile processors.
> The IMC PMU enables measuring memory bandwidth.
>
> To use with perf:
> $ perf stat -a -I 1000 -e uncore_imc/data_reads/,uncore_imc/data_writes/ 
> sleep 10
>
> Signed-off-by: Stephane Eranian 

Tested-by: Sonny Rao 

> ---
>
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c 
> b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
> index c635b8b..a03f964 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
> @@ -922,6 +922,9 @@ static int __init uncore_pci_init(void)
> case 69: /* Haswell Celeron */
> ret = hsw_uncore_pci_init();
> break;
> +   case 61: /* Broadwell */
> +   ret = bdw_uncore_pci_init();
> +   break;
> default:
> return 0;
> }
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h 
> b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
> index 6c8c1e7..06b0793 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
> +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
> @@ -326,6 +326,7 @@ extern struct event_constraint uncore_constraint_empty;
>  int snb_uncore_pci_init(void);
>  int ivb_uncore_pci_init(void);
>  int hsw_uncore_pci_init(void);
> +int bdw_uncore_pci_init(void);
>  void snb_uncore_cpu_init(void);
>  void nhm_uncore_cpu_init(void);
>
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c 
> b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
> index 0333d0b..0f768bf 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
> @@ -7,6 +7,7 @@
>  #define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
>  #define PCI_DEVICE_ID_INTEL_HSW_IMC0x0c00
>  #define PCI_DEVICE_ID_INTEL_HSW_U_IMC  0x0a04
> +#define PCI_DEVICE_ID_INTEL_BDW_IMC0x1604
>
>  /* SNB event control */
>  #define SNB_UNC_CTL_EV_SEL_MASK0x00ff
> @@ -488,6 +489,14 @@ static const struct pci_device_id hsw_uncore_pci_ids[] = 
> {
> { /* end: all zeroes */ },
>  };
>
> +static const struct pci_device_id bdw_uncore_pci_ids[] = {
> +   { /* IMC */
> +   PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_IMC),
> +   .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
> +   },
> +   { /* end: all zeroes */ },
> +};
> +
>  static struct pci_driver snb_uncore_pci_driver = {
> .name   = "snb_uncore",
> .id_table   = snb_uncore_pci_ids,
> @@ -503,6 +512,11 @@ static struct pci_driver hsw_uncore_pci_driver = {
> .id_table   = hsw_uncore_pci_ids,
>  };
>
> +static struct pci_driver bdw_uncore_pci_driver = {
> +   .name   = "bdw_uncore",
> +   .id_table   = bdw_uncore_pci_ids,
> +};
> +
>  struct imc_uncore_pci_dev {
> __u32 pci_id;
> struct pci_driver *driver;
> @@ -516,6 +530,7 @@ static const struct imc_uncore_pci_dev 
> desktop_imc_pci_ids[] = {
> IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd 
> Gen Core processor */
> IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver),/* 4th Gen Core 
> Processor */
> IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver),  /* 4th Gen Core ULT 
> Mobile Processor */
> +   IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver),/* 5th Gen Core U */
> {  /* end marker */ }
>  };
>
> @@ -563,6 +578,11 @@ int hsw_uncore_pci_init(void)
> return imc_uncore_pci_init();
>  }
>
> +int bdw_uncore_pci_init(void)
> +{
> +   return imc_uncore_pci_init();
> +}
> +
>  /* end of Sandy Bridge uncore support */
>
>  /* Nehalem uncore support */
> --
> 2.1.0
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[tip:perf/urgent] perf/x86/intel/uncore: Move PCI IDs for IMC to uncore driver

2015-04-22 Thread tip-bot for Sonny Rao

Commit-ID:  0140e6141e4f1d4b15fb469e6912b0e71b7d1cc2
Gitweb: http://git.kernel.org/tip/0140e6141e4f1d4b15fb469e6912b0e71b7d1cc2
Author: Sonny Rao 
AuthorDate: Tue, 21 Apr 2015 12:33:11 -0700
Committer:  Ingo Molnar 
CommitDate: Wed, 22 Apr 2015 08:29:19 +0200

perf/x86/intel/uncore: Move PCI IDs for IMC to uncore driver

This keeps all the related PCI IDs together in the driver where
they are used.

Signed-off-by: Sonny Rao 
Acked-by: Bjorn Helgaas 
Cc: Arnaldo Carvalho de Melo 
Cc: Paul Mackerras 
Cc: Peter Zijlstra 
Cc: Stephane Eranian 
Link: 
http://lkml.kernel.org/r/1429644791-25724-1-git-send-email-sonny...@chromium.org
Signed-off-by: Ingo Molnar 
---
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 6 +-
 include/linux/pci_ids.h   | 4 
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index ca75e70..4562e9e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -1,7 +1,11 @@
 /* Nehalem/SandBridge/Haswell uncore support */
 #include "perf_event_intel_uncore.h"
 
-/* Uncore IMC PCI Id */
+/* Uncore IMC PCI IDs */
+#define PCI_DEVICE_ID_INTEL_SNB_IMC0x0100
+#define PCI_DEVICE_ID_INTEL_IVB_IMC0x0154
+#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
+#define PCI_DEVICE_ID_INTEL_HSW_IMC0x0c00
 #define PCI_DEVICE_ID_INTEL_HSW_U_IMC  0x0a04
 
 /* SNB event control */
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index e63c02a..a593858 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2539,10 +2539,6 @@
 
 #define PCI_VENDOR_ID_INTEL0x8086
 #define PCI_DEVICE_ID_INTEL_EESSC  0x0008
-#define PCI_DEVICE_ID_INTEL_SNB_IMC0x0100
-#define PCI_DEVICE_ID_INTEL_IVB_IMC0x0154
-#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
-#define PCI_DEVICE_ID_INTEL_HSW_IMC0x0c00
 #define PCI_DEVICE_ID_INTEL_PXHD_0 0x0320
 #define PCI_DEVICE_ID_INTEL_PXHD_1 0x0321
 #define PCI_DEVICE_ID_INTEL_PXH_0  0x0329
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[tip:perf/urgent] perf/x86/intel/uncore: Add support for Intel Haswell ULT (lower power Mobile Processor) IMC uncore PMUs

2015-04-22 Thread tip-bot for Sonny Rao

Commit-ID:  80bcffb376a6890dd7452b12c1ba032f8f24fef6
Gitweb: http://git.kernel.org/tip/80bcffb376a6890dd7452b12c1ba032f8f24fef6
Author: Sonny Rao 
AuthorDate: Mon, 20 Apr 2015 15:34:07 -0700
Committer:  Ingo Molnar 
CommitDate: Wed, 22 Apr 2015 08:27:43 +0200

perf/x86/intel/uncore: Add support for Intel Haswell ULT (lower power Mobile 
Processor) IMC uncore PMUs

This uncore is the same as the Haswell desktop part but uses a
different PCI ID.

Signed-off-by: Sonny Rao 
Cc: Arnaldo Carvalho de Melo 
Cc: Bjorn Helgaas 
Cc: Paul Mackerras 
Cc: Peter Zijlstra 
Cc: Stephane Eranian 
Link: 
http://lkml.kernel.org/r/1429569247-16697-1-git-send-email-sonny...@chromium.org
Signed-off-by: Ingo Molnar 
---
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index 3001015..ca75e70 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -1,6 +1,9 @@
 /* Nehalem/SandBridge/Haswell uncore support */
 #include "perf_event_intel_uncore.h"
 
+/* Uncore IMC PCI Id */
+#define PCI_DEVICE_ID_INTEL_HSW_U_IMC  0x0a04
+
 /* SNB event control */
 #define SNB_UNC_CTL_EV_SEL_MASK0x00ff
 #define SNB_UNC_CTL_UMASK_MASK 0xff00
@@ -472,6 +475,10 @@ static const struct pci_device_id hsw_uncore_pci_ids[] = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
},
+   { /* IMC */
+   PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC),
+   .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+   },
{ /* end: all zeroes */ },
 };
 
@@ -502,6 +509,7 @@ static const struct imc_uncore_pci_dev 
desktop_imc_pci_ids[] = {
IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver),/* 3rd Gen Core processor 
*/
IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen 
Core processor */
IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver),/* 4th Gen Core Processor 
*/
+   IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver),  /* 4th Gen Core ULT Mobile 
Processor */
{  /* end marker */ }
 };
 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCHv2] perf/x86/intel/uncore: Move PCI IDs for IMC to uncore driver

2015-04-21 Thread Sonny Rao

This keeps all the related PCI IDs together in the driver where they
are used.

Signed-off-by: Sonny Rao 
Acked-by: Bjorn Helgaas 
---
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 6 +-
 include/linux/pci_ids.h   | 4 
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index ca75e70..4562e9e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -1,7 +1,11 @@
 /* Nehalem/SandBridge/Haswell uncore support */
 #include "perf_event_intel_uncore.h"
 
-/* Uncore IMC PCI Id */
+/* Uncore IMC PCI IDs */
+#define PCI_DEVICE_ID_INTEL_SNB_IMC0x0100
+#define PCI_DEVICE_ID_INTEL_IVB_IMC0x0154
+#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
+#define PCI_DEVICE_ID_INTEL_HSW_IMC0x0c00
 #define PCI_DEVICE_ID_INTEL_HSW_U_IMC  0x0a04
 
 /* SNB event control */
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 38cff8f..2f7b9a4 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2541,10 +2541,6 @@
 
 #define PCI_VENDOR_ID_INTEL0x8086
 #define PCI_DEVICE_ID_INTEL_EESSC  0x0008
-#define PCI_DEVICE_ID_INTEL_SNB_IMC0x0100
-#define PCI_DEVICE_ID_INTEL_IVB_IMC0x0154
-#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
-#define PCI_DEVICE_ID_INTEL_HSW_IMC0x0c00
 #define PCI_DEVICE_ID_INTEL_PXHD_0 0x0320
 #define PCI_DEVICE_ID_INTEL_PXHD_1 0x0321
 #define PCI_DEVICE_ID_INTEL_PXH_0  0x0329
-- 
2.1.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] perf/x86/intel/uncore: Move PCI IDs for IMC to uncore driver

2015-04-21 Thread Sonny Rao

On Tue, Apr 21, 2015 at 12:21 PM, Bjorn Helgaas  wrote:
> On Tue, Apr 21, 2015 at 2:09 PM, Sonny Rao  wrote:
>> This keeps all the related PCI IDs together in the driver where they
>> are used.
>>
>> Signed-off-by: Sonny Rao 
>
> Acked-by: Bjorn Helgaas 
>
>> ---
>>  arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 6 +-
>>  include/linux/pci_ids.h   | 4 
>>  2 files changed, 5 insertions(+), 5 deletions(-)
>>
>> diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c 
>> b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
>> index ca75e70..02c1a13 100644
>> --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
>> +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
>> @@ -1,7 +1,11 @@
>>  /* Nehalem/SandBridge/Haswell uncore support */
>>  #include "perf_event_intel_uncore.h"
>>
>> -/* Uncore IMC PCI Id */
>> +/* Uncore IMC PCI Ids */
>
> "IDs" would be more consistent.
>

Oops, will fix.

>> +#define PCI_DEVICE_ID_INTEL_SNB_IMC0x0100
>> +#define PCI_DEVICE_ID_INTEL_IVB_IMC0x0154
>> +#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
>> +#define PCI_DEVICE_ID_INTEL_HSW_IMC0x0c00
>>  #define PCI_DEVICE_ID_INTEL_HSW_U_IMC  0x0a04
>>
>>  /* SNB event control */
>> diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
>> index 38cff8f..2f7b9a4 100644
>> --- a/include/linux/pci_ids.h
>> +++ b/include/linux/pci_ids.h
>> @@ -2541,10 +2541,6 @@
>>
>>  #define PCI_VENDOR_ID_INTEL0x8086
>>  #define PCI_DEVICE_ID_INTEL_EESSC  0x0008
>> -#define PCI_DEVICE_ID_INTEL_SNB_IMC0x0100
>> -#define PCI_DEVICE_ID_INTEL_IVB_IMC0x0154
>> -#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
>> -#define PCI_DEVICE_ID_INTEL_HSW_IMC0x0c00
>>  #define PCI_DEVICE_ID_INTEL_PXHD_0 0x0320
>>  #define PCI_DEVICE_ID_INTEL_PXHD_1 0x0321
>>  #define PCI_DEVICE_ID_INTEL_PXH_0  0x0329
>> --
>> 2.1.2
>>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] perf/x86/intel/uncore: Move PCI IDs for IMC to uncore driver

2015-04-21 Thread Sonny Rao

This keeps all the related PCI IDs together in the driver where they
are used.

Signed-off-by: Sonny Rao 
---
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 6 +-
 include/linux/pci_ids.h   | 4 
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index ca75e70..02c1a13 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -1,7 +1,11 @@
 /* Nehalem/SandBridge/Haswell uncore support */
 #include "perf_event_intel_uncore.h"
 
-/* Uncore IMC PCI Id */
+/* Uncore IMC PCI Ids */
+#define PCI_DEVICE_ID_INTEL_SNB_IMC0x0100
+#define PCI_DEVICE_ID_INTEL_IVB_IMC0x0154
+#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
+#define PCI_DEVICE_ID_INTEL_HSW_IMC0x0c00
 #define PCI_DEVICE_ID_INTEL_HSW_U_IMC  0x0a04
 
 /* SNB event control */
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 38cff8f..2f7b9a4 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2541,10 +2541,6 @@
 
 #define PCI_VENDOR_ID_INTEL0x8086
 #define PCI_DEVICE_ID_INTEL_EESSC  0x0008
-#define PCI_DEVICE_ID_INTEL_SNB_IMC0x0100
-#define PCI_DEVICE_ID_INTEL_IVB_IMC0x0154
-#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
-#define PCI_DEVICE_ID_INTEL_HSW_IMC0x0c00
 #define PCI_DEVICE_ID_INTEL_PXHD_0 0x0320
 #define PCI_DEVICE_ID_INTEL_PXHD_1 0x0321
 #define PCI_DEVICE_ID_INTEL_PXH_0  0x0329
-- 
2.1.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[tip:x86/urgent] perf/x86/intel/uncore: Add support for Intel Haswell ULT (lower power Mobile Processor) IMC uncore PMUs

2015-04-21 Thread tip-bot for Sonny Rao

Commit-ID:  5324e72e00012126101aee6f3e62977055a3b5ee
Gitweb: http://git.kernel.org/tip/5324e72e00012126101aee6f3e62977055a3b5ee
Author: Sonny Rao 
AuthorDate: Mon, 20 Apr 2015 15:34:07 -0700
Committer:  Ingo Molnar 
CommitDate: Tue, 21 Apr 2015 09:31:17 +0200

perf/x86/intel/uncore: Add support for Intel Haswell ULT (lower power Mobile 
Processor) IMC uncore PMUs

This uncore is the same as the Haswell desktop part but uses a
different PCI ID.

Signed-off-by: Sonny Rao 
Cc: Arnaldo Carvalho de Melo 
Cc: Bjorn Helgaas 
Cc: Paul Mackerras 
Cc: Peter Zijlstra 
Cc: Stephane Eranian 
Link: 
http://lkml.kernel.org/r/1429569247-16697-1-git-send-email-sonny...@chromium.org
Signed-off-by: Ingo Molnar 
---
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index 3001015..ca75e70 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -1,6 +1,9 @@
 /* Nehalem/SandBridge/Haswell uncore support */
 #include "perf_event_intel_uncore.h"
 
+/* Uncore IMC PCI Id */
+#define PCI_DEVICE_ID_INTEL_HSW_U_IMC  0x0a04
+
 /* SNB event control */
 #define SNB_UNC_CTL_EV_SEL_MASK0x00ff
 #define SNB_UNC_CTL_UMASK_MASK 0xff00
@@ -472,6 +475,10 @@ static const struct pci_device_id hsw_uncore_pci_ids[] = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
},
+   { /* IMC */
+   PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC),
+   .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+   },
{ /* end: all zeroes */ },
 };
 
@@ -502,6 +509,7 @@ static const struct imc_uncore_pci_dev 
desktop_imc_pci_ids[] = {
IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver),/* 3rd Gen Core processor 
*/
IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen 
Core processor */
IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver),/* 4th Gen Core Processor 
*/
+   IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver),  /* 4th Gen Core ULT Mobile 
Processor */
{  /* end marker */ }
 };
 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCHv2] perf/x86/intel/uncore: add support for Haswell ULT IMC uncore

2015-04-20 Thread Sonny Rao

This uncore is the same as the Haswell desktop part but uses a
different PCI ID.

Signed-off-by: Sonny Rao 
---
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index 3001015..ca75e70 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -1,6 +1,9 @@
 /* Nehalem/SandBridge/Haswell uncore support */
 #include "perf_event_intel_uncore.h"
 
+/* Uncore IMC PCI Id */
+#define PCI_DEVICE_ID_INTEL_HSW_U_IMC  0x0a04
+
 /* SNB event control */
 #define SNB_UNC_CTL_EV_SEL_MASK0x00ff
 #define SNB_UNC_CTL_UMASK_MASK 0xff00
@@ -472,6 +475,10 @@ static const struct pci_device_id hsw_uncore_pci_ids[] = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
},
+   { /* IMC */
+   PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC),
+   .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+   },
{ /* end: all zeroes */ },
 };
 
@@ -502,6 +509,7 @@ static const struct imc_uncore_pci_dev 
desktop_imc_pci_ids[] = {
IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver),/* 3rd Gen Core processor 
*/
IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen 
Core processor */
IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver),/* 4th Gen Core Processor 
*/
+   IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver),  /* 4th Gen Core ULT Mobile 
Processor */
{  /* end marker */ }
 };
 
-- 
2.1.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] perf/x86/intel/uncore: add support for Haswell ULT IMC uncore

2015-04-20 Thread Sonny Rao

On Mon, Apr 20, 2015 at 12:34 PM, Bjorn Helgaas  wrote:
> On Mon, Apr 20, 2015 at 1:58 PM, Stephane Eranian  wrote:
>> On Mon, Apr 20, 2015 at 11:56 AM, Bjorn Helgaas  wrote:
>>>
>>> On Mon, Apr 20, 2015 at 1:42 PM, Sonny Rao  wrote:
>>> > This uncore is the same as the Haswell desktop part but uses a
>>> > different PCI ID.
>>> >
>>> > Signed-off-by: Sonny Rao 
>>> > ---
>>> >  arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 5 +
>>> >  include/linux/pci_ids.h   | 1 +
>>> >  2 files changed, 6 insertions(+)
>>> >
>>> > diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c 
>>> > b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
>>> > index 3001015..0bda6fc 100644
>>> > --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
>>> > +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
>>> > @@ -472,6 +472,10 @@ static const struct pci_device_id 
>>> > hsw_uncore_pci_ids[] = {
>>> > PCI_DEVICE(PCI_VENDOR_ID_INTEL, 
>>> > PCI_DEVICE_ID_INTEL_HSW_IMC),
>>> > .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
>>> > },
>>> > +   { /* IMC */
>>> > +   PCI_DEVICE(PCI_VENDOR_ID_INTEL, 
>>> > PCI_DEVICE_ID_INTEL_HSW_U_IMC),
>>> > +   .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
>>> > +   },
>>> > { /* end: all zeroes */ },
>>> >  };
>>> >
>>> > @@ -502,6 +506,7 @@ static const struct imc_uncore_pci_dev 
>>> > desktop_imc_pci_ids[] = {
>>> > IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver),/* 3rd Gen Core 
>>> > processor */
>>> > IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 
>>> > v2/3rd Gen Core processor */
>>> > IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver),/* 4th Gen Core 
>>> > Processor */
>>> > +   IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver),  /* 4th Gen Core ULT 
>>> > Mobile Processor */
>>> > {  /* end marker */ }
>>> >  };
>>> >
>>> > diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
>>> > index 38cff8f..e5ae042 100644
>>> > --- a/include/linux/pci_ids.h
>>> > +++ b/include/linux/pci_ids.h
>>> > @@ -2545,6 +2545,7 @@
>>> >  #define PCI_DEVICE_ID_INTEL_IVB_IMC0x0154
>>> >  #define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
>>> >  #define PCI_DEVICE_ID_INTEL_HSW_IMC0x0c00
>>> > +#define PCI_DEVICE_ID_INTEL_HSW_U_IMC  0x0a04
>>>
>>> Please either use the 0x0a04 constant directly in
>>> perf_event_intel_uncore_snb.c, or explain why the #define should be
>>> here, e.g., maybe it will be used in multiple places.  See the comment
>>> at the top of pci_ids.h.
>>>
>> But then, the same reasoning would apply to the other 3 IMC defines,
>> wouldn't it?
>
> Yes.  But if we made a mistake in the past, that doesn't mean we
> should repeat it today.

Shall I post a patch moving the others as well?

>
>>> >  #define PCI_DEVICE_ID_INTEL_PXHD_0 0x0320
>>> >  #define PCI_DEVICE_ID_INTEL_PXHD_1 0x0321
>>> >  #define PCI_DEVICE_ID_INTEL_PXH_0  0x0329
>>> > --
>>> > 2.1.2
>>> >
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] perf/x86/intel/uncore: add support for Haswell ULT IMC uncore

2015-04-20 Thread Sonny Rao

This uncore is the same as the Haswell desktop part but uses a
different PCI ID.

Signed-off-by: Sonny Rao 
---
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 5 +
 include/linux/pci_ids.h   | 1 +
 2 files changed, 6 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index 3001015..0bda6fc 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -472,6 +472,10 @@ static const struct pci_device_id hsw_uncore_pci_ids[] = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
},
+   { /* IMC */
+   PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC),
+   .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+   },
{ /* end: all zeroes */ },
 };
 
@@ -502,6 +506,7 @@ static const struct imc_uncore_pci_dev 
desktop_imc_pci_ids[] = {
IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver),/* 3rd Gen Core processor 
*/
IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen 
Core processor */
IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver),/* 4th Gen Core Processor 
*/
+   IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver),  /* 4th Gen Core ULT Mobile 
Processor */
{  /* end marker */ }
 };
 
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 38cff8f..e5ae042 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2545,6 +2545,7 @@
 #define PCI_DEVICE_ID_INTEL_IVB_IMC0x0154
 #define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
 #define PCI_DEVICE_ID_INTEL_HSW_IMC0x0c00
+#define PCI_DEVICE_ID_INTEL_HSW_U_IMC  0x0a04
 #define PCI_DEVICE_ID_INTEL_PXHD_0 0x0320
 #define PCI_DEVICE_ID_INTEL_PXHD_1 0x0321
 #define PCI_DEVICE_ID_INTEL_PXH_0  0x0329
-- 
2.1.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] arm: dts: rk3288: Enable Cortex-A12 HW PMU events

2015-04-07 Thread Sonny Rao

This adds the dts node for the PMU with the correct PMUIRQ interrupts
for each core.

Signed-off-by: Sonny Rao 
---
 arch/arm/boot/dts/rk3288.dtsi | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/arm/boot/dts/rk3288.dtsi b/arch/arm/boot/dts/rk3288.dtsi
index 165968d..8253abb 100644
--- a/arch/arm/boot/dts/rk3288.dtsi
+++ b/arch/arm/boot/dts/rk3288.dtsi
@@ -44,6 +44,14 @@
spi2 = &spi2;
};
 
+   arm-pmu {
+   compatible = "arm,cortex-a12-pmu";
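+   interrupts = <GIC_SPI 151 IRQ_TYPE_LEVEL_HIGH>,
+                <GIC_SPI 152 IRQ_TYPE_LEVEL_HIGH>,
+                <GIC_SPI 153 IRQ_TYPE_LEVEL_HIGH>,
+                <GIC_SPI 154 IRQ_TYPE_LEVEL_HIGH>;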
+   };
+
cpus {
#address-cells = <1>;
#size-cells = <0>;
-- 
2.2.0.rc0.207.ga3a616c

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] RTC: RK808: fix the rtc time reading issue

2015-01-14 Thread Sonny Rao

On Wed, Jan 14, 2015 at 10:36 AM, Doug Anderson  wrote:
> Sonny,
>
>> Chris, it looks like you swapped the set and the clear of this bit,
>> and you're relying on the fact that the i2c transaction takes a
>> certain amount of time after the RTC_GET_TIME BIT is set.   I'm not
>> sure how long it actually takes, but why not just put in a usleep()
>> for the minimum wait time?
>
> I think we are safe.
>
> At 400kHz (the max speed of this part) each bit can be transferred no
> faster than 2.5us.  In order to do a valid i2c transaction we need to
> _at least_ write the address of the device and the data onto the bus,
> which is 16 bits.  16 * 2.5us = 40us.  That's above the 31.25us
>
> Personally I think what Chris has is fine, with the comment.

Ok, I'm fine with that if we're sure it's slow enough.  Comment
explaining would certainly help.

>
> -Doug
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] RTC: RK808: fix the rtc time reading issue

2015-01-14 Thread Sonny Rao

On Tue, Jan 13, 2015 at 6:43 PM, Chris Zhong  wrote:
> After we set the GET_TIME bit, the rtc time couldn't be read immediately,
> we should wait up to 31.25 us, about one cycle of 32khz. Otherwise reading
> RTC time will return an old time. If we clear the GET_TIME bit after setting
> it, the i2c transfer will certainly take more than 31.25us.
>
> Signed-off-by: Chris Zhong 
>
> ---
>
>  drivers/rtc/rtc-rk808.c | 9 +++--
>  1 file changed, 7 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/rtc/rtc-rk808.c b/drivers/rtc/rtc-rk808.c
> index df42257..8dae322 100644
> --- a/drivers/rtc/rtc-rk808.c
> +++ b/drivers/rtc/rtc-rk808.c
> @@ -67,15 +67,20 @@ static int rk808_rtc_readtime(struct device *dev, struct 
> rtc_time *tm)
> /* Force an update of the shadowed registers right now */
> ret = regmap_update_bits(rk808->regmap, RK808_RTC_CTRL_REG,
>  BIT_RTC_CTRL_REG_RTC_GET_TIME,
> -0);
> +BIT_RTC_CTRL_REG_RTC_GET_TIME);
> if (ret) {
> dev_err(dev, "Failed to update bits rtc_ctrl: %d\n", ret);
> return ret;
> }
>
> +   /* After we set the GET_TIME bit, the rtc time couldn't be read
> +* immediately, we should wait up to 31.25 us, about one cycle of
> +* 32khz. If we clear the GET_TIME bit here, the time of i2c transfer
> +* certainly more than 31.25us.
> +*/

Chris, it looks like you swapped the set and the clear of this bit,
and you're relying on the fact that the i2c transaction takes a
certain amount of time after the RTC_GET_TIME BIT is set.   I'm not
sure how long it actually takes, but why not just put in a usleep()
for the minimum wait time?

> ret = regmap_update_bits(rk808->regmap, RK808_RTC_CTRL_REG,
>  BIT_RTC_CTRL_REG_RTC_GET_TIME,
> -BIT_RTC_CTRL_REG_RTC_GET_TIME);
> +0);
> if (ret) {
> dev_err(dev, "Failed to update bits rtc_ctrl: %d\n", ret);
> return ret;
> --
> 1.9.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: arm64 hitting BUG in arch_timer.h

2014-12-10 Thread Sonny Rao

On Wed, Dec 10, 2014 at 12:56 PM, Mark Salter  wrote:
> Using Linus' tree from this morning, I am hitting:
>
>[0.00] BUG: failure at 
> ./arch/arm64/include/asm/arch_timer.h:112/arch_counter_get_cntpct!
>
> This is triggered by commit 0b46b8a718 ("clocksource: arch_timer: Fix
> code to use physical timers when requested") which addresses an armv7
> problem. Arm64 wants to always use a virtual timer. I used this to avoid
> the BUG and get a booting kernel:
>
> diff --git a/drivers/clocksource/arm_arch_timer.c 
> b/drivers/clocksource/arm_arch
> index 71846f9..4d8a01e 100644
> --- a/drivers/clocksource/arm_arch_timer.c
> +++ b/drivers/clocksource/arm_arch_timer.c
> @@ -468,7 +468,7 @@ static void __init arch_counter_register(unsigned type)
>
> /* Register the CP15 based counter if we have one */
> if (type & ARCH_CP15_TIMER) {
> -   if (arch_timer_use_virtual)
> +   if (IS_ENABLED(CONFIG_ARM64) || arch_timer_use_virtual)
> arch_timer_read_counter = arch_counter_get_cntvct;
> else
> arch_timer_read_counter = arch_counter_get_cntpct;
>
>

Yes Catalin has prepared a similar patch:
https://patchwork.kernel.org/patch/5468031/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [alsa-devel] [PATCH v2 2/2] ASoC: rockchip: i2s: add support for grabbing output clock to codec

2014-12-03 Thread Sonny Rao

On Wed, Dec 3, 2014 at 3:22 PM, Dylan Reid  wrote:
> On Wed, Dec 3, 2014 at 3:03 PM, Sonny Rao  wrote:
>> On Wed, Dec 3, 2014 at 12:03 PM, Mark Brown  wrote:
>>> On Wed, Dec 03, 2014 at 11:38:13AM -0800, Sonny Rao wrote:
>>>> On Wed, Dec 3, 2014 at 11:20 AM, Mark Brown  wrote:
>>>
>>>> > I would expect that the clock for the CODEC should be managed by the
>>>> > CODEC if at all possible - that seems more logical than having the CPU
>>>> > I2S controller request and manage it if it's a separate clock.  Why add
>>>> > this to the CPU side driver?
>>>
>>>> This output clock has a mux and can either be a fixed 12Mhz output or
>>>> can be derived from the same fractional divider which drives the i2s
>>>> block.   I thought it was simpler to keep them all the same, but need
>>>> to put ownership in the i2s in anticipation of the i2s driver setting
>>>> its own clock rate.
>>>
>>>> If you think this is an implementation detail and this output clock
>>>> should just be owned by the codec driver, even though I'm guessing it
>>>> will just have to be the same as i2s, then I think we can drop this
>>>> and make sure simple card (or whatever other codec driver) claims this
>>>> clock.
>>>
>>> simple-card obviously isn't a CODEC driver...
>>
>> Yeah, sorry.
>>
>>> For generality I think
>>> the clock does need to be exposed to the CODEC driver, otherwise this
>>> will work differently to how other systems are working and we can't
>>> substitute in a different clock on the CODEC side so easily if it
>>> doesn't happen to use the output from the I2S block.
>>
>> Ok, then I think what we will do is abandon this patch and I will send
>> something that adds this functionality to the particular codec that
>> I'm interested in -- max98090.
>
> Sorry I didn't read this earlier.  I don't think that this belongs in
> the max98090.  The original patch description is a bit confusing.  The
> clock being grabbed here is actually i2s mclk.  My understanding is
> that, on this SoC, the mclk is driven from a different IP block than
> the rest of the i2s signals.  The i2s driver needs to be told about
> the clock and enable/disable it at the appropriate times.  I'm
> assuming it's optional because there are boards using this SoC with
> i2s slave mode that don't drive mclk at all.
>
> Please correct me if I'm wrong on any of the above.

I don't think you're wrong, and I'm an audio/i2s neophyte so I think
you're probably right and hopefully Mark can confirm that this is how
we want it.

One important thing to point out, which might be causing confusion, is
that this driver is claiming a clock which it internally calls "mclk"
but the way it's specified for rk3288 in the DT, that one is just the
one which drives the internal logic and has a gate.

This clock I'm adding is the actual mclk which is being driven to the
i2s slave device, and it has its own gate and also has a mux, and we
need to claim both to be able to enable proper clock gating.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v2 2/2] ASoC: rockchip: i2s: add support for grabbing output clock to codec

2014-12-03 Thread Sonny Rao

On Wed, Dec 3, 2014 at 12:03 PM, Mark Brown  wrote:
> On Wed, Dec 03, 2014 at 11:38:13AM -0800, Sonny Rao wrote:
>> On Wed, Dec 3, 2014 at 11:20 AM, Mark Brown  wrote:
>
>> > I would expect that the clock for the CODEC should be managed by the
>> > CODEC if at all possible - that seems more logical than having the CPU
>> > I2S controller request and manage it if it's a separate clock.  Why add
>> > this to the CPU side driver?
>
>> This output clock has a mux and can either be a fixed 12Mhz output or
>> can be derived from the same fractional divider which drives the i2s
>> block.   I thought it was simpler to keep them all the same, but need
>> to put ownership in the i2s in anticipation of the i2s driver setting
>> its own clock rate.
>
>> If you think this is an implementation detail and this output clock
>> should just be owned by the codec driver, even though I'm guessing it
>> will just have to be the same as i2s, then I think we can drop this
>> and make sure simple card (or whatever other codec driver) claims this
>> clock.
>
> simple-card obviously isn't a CODEC driver...

Yeah, sorry.

> For generality I think
> the clock does need to be exposed to the CODEC driver, otherwise this
> will work differently to how other systems are working and we can't
> substitute in a different clock on the CODEC side so easily if it
> doesn't happen to use the output from the I2S block.

Ok, then I think what we will do is abandon this patch and I will send
something that adds this functionality to the particular codec that
I'm interested in -- max98090.

I'm a little tied up at the moment so I'm not going to send that for a
little while, but will come eventually.

Thanks for the advice!
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v2 2/2] ASoC: rockchip: i2s: add support for grabbing output clock to codec

2014-12-03 Thread Sonny Rao

On Wed, Dec 3, 2014 at 11:20 AM, Mark Brown  wrote:
> On Wed, Dec 03, 2014 at 03:18:38PM +0800, Jianqun Xu wrote:
>> From: Sonny Rao 
>>
>> We need to claim the clock which is driving the codec so that when we enable
>> clock gating, we continue to clock the codec when needed.  I make this an
>> optional clock since there might be some applications where we don't need it
>> but can still use the I2S block.
>
> I would expect that the clock for the CODEC should be managed by the
> CODEC if at all possible - that seems more logical than having the CPU
> I2S controller request and manage it if it's a separate clock.  Why add
> this to the CPU side driver?

It's a good question.  Right now the way I'm running this stuff we're
mostly setting all the i2s clock rates from the codec driver, but
I think this isn't the correct way to go, and the i2s driver needs to
set its rate based on the hw params, but that isn't happening (yet).

This output clock has a mux and can either be a fixed 12Mhz output or
can be derived from the same fractional divider which drives the i2s
block.   I thought it was simpler to keep them all the same, but need
to put ownership in the i2s in anticipation of the i2s driver setting
its own clock rate.

If you think this is an implementation detail and this output clock
should just be owned by the codec driver, even though I'm guessing it
will just have to be the same as i2s, then I think we can drop this
and make sure simple card (or whatever other codec driver) claims this
clock.

>
> We've not always done this for older systems due to the lack of a usable
> clock API but that's starting to be addressed.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] clk: rockchip: rk3288 export i2s0_clkout for use in DT

2014-11-26 Thread Sonny Rao

On Wed, Nov 26, 2014 at 3:32 PM, Heiko Stübner  wrote:
> Am Dienstag, 18. November 2014, 23:15:19 schrieb Sonny Rao:
>> This exposes the clock that comes out of the i2s block which generally
>> goes to the audio codec.
>>
>> Signed-off-by: Sonny Rao 
>
> applied to my clk branch after removing the CLK_SET_RATE_PARENT

Hi, sorry for the delay, and thanks for fixing it.  I think when I
applied the patch to next-20141118 that had a CLK_SET_RATE_PARENT in
it from this patch:

commit fc69ed70c16a31d6a77ec47a30a9fe941f763f1e
Author: Jianqun 
Date:   Tue Sep 30 11:12:04 2014 +0800

clk: rockchip: rk3288: i2s_frac adds flag to set parent's rate


I agree that it is not necessary and maybe not desirable.  Thanks again!


> Heiko
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] ARM: dts: rk3288: add arm,cpu-registers-not-fw-configured

2014-11-25 Thread Sonny Rao

This will enable use of physical arch timers on rk3288, where each
core comes out of reset with a different virtual offset.  Using
physical timers will help with SMP booting on coreboot and older
u-boot and should also allow suspend-resume and cpu-hotplug to work on
all firmwares.

Firmware which does initialize the cpu registers properly at boot and
cpu-hotplug can remove this property from the device tree.

Signed-off-by: Sonny Rao 
---
 arch/arm/boot/dts/rk3288.dtsi | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm/boot/dts/rk3288.dtsi b/arch/arm/boot/dts/rk3288.dtsi
index 0f50d5d..c861f52 100644
--- a/arch/arm/boot/dts/rk3288.dtsi
+++ b/arch/arm/boot/dts/rk3288.dtsi
@@ -139,6 +139,7 @@
 
timer {
compatible = "arm,armv7-timer";
+   arm,cpu-registers-not-fw-configured;
interrupts = ,
 ,
 ,
-- 
2.1.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v5] clocksource: arch_timer: Fix code to use physical timers when requested

2014-11-23 Thread Sonny Rao

This is a bug fix for using physical arch timers when
the arch_timer_use_virtual boolean is false.  It restores the
arch_counter_get_cntpct() function after removal in

0d651e4e "clocksource: arch_timer: use virtual counters"

We need this on certain ARMv7 systems which are architected like this:

* The firmware doesn't know and doesn't care about hypervisor mode and
  we don't want to add the complexity of hypervisor there.

* The firmware isn't involved in SMP bringup or resume.

* The ARCH timer comes up with an uninitialized offset between the
  virtual and physical counters.  Each core gets a different random
  offset.

* The device boots in "Secure SVC" mode.

* Nothing has touched the reset value of CNTHCTL.PL1PCEN or
  CNTHCTL.PL1PCTEN (both default to 1 at reset)

One example of such a system is RK3288 where it is much simpler to
use the physical counter since there's nobody managing the offset and
each time a core goes down and comes back up it will get reinitialized
to some other random value.

Fixes: 0d651e4e65e9 ("clocksource: arch_timer: use virtual counters")
Cc: sta...@vger.kernel.org
Signed-off-by: Sonny Rao 
Acked-by: Olof Johansson 
---
v2: Add fixes tag to commit message, cc stable, copy Doug's
description of the systems which need this in commit message.
v3: Don't change the memory-mapped physical timer/counter code
v4: remove the memory-mapped physical counter code since it's not used
v5: rebase and make AArch64 version of arch_counter_get_cntpct call BUG()
---
 arch/arm/include/asm/arch_timer.h| 9 +
 arch/arm64/include/asm/arch_timer.h  | 9 +
 drivers/clocksource/arm_arch_timer.c | 5 -
 3 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/arch/arm/include/asm/arch_timer.h 
b/arch/arm/include/asm/arch_timer.h
index 92793ba..d4ebf56 100644
--- a/arch/arm/include/asm/arch_timer.h
+++ b/arch/arm/include/asm/arch_timer.h
@@ -78,6 +78,15 @@ static inline u32 arch_timer_get_cntfrq(void)
return val;
 }
 
+static inline u64 arch_counter_get_cntpct(void)
+{
+   u64 cval;
+
+   isb();
+   asm volatile("mrrc p15, 0, %Q0, %R0, c14" : "=r" (cval));
+   return cval;
+}
+
 static inline u64 arch_counter_get_cntvct(void)
 {
u64 cval;
diff --git a/arch/arm64/include/asm/arch_timer.h 
b/arch/arm64/include/asm/arch_timer.h
index f190971..b1fa4e6 100644
--- a/arch/arm64/include/asm/arch_timer.h
+++ b/arch/arm64/include/asm/arch_timer.h
@@ -104,6 +104,15 @@ static inline void arch_timer_set_cntkctl(u32 cntkctl)
asm volatile("msr   cntkctl_el1, %0" : : "r" (cntkctl));
 }
 
+static inline u64 arch_counter_get_cntpct(void)
+{
+   /*
+* AArch64 kernel and user space mandate the use of CNTVCT.
+*/
+   BUG();
+   return 0;
+}
+
 static inline u64 arch_counter_get_cntvct(void)
 {
u64 cval;
diff --git a/drivers/clocksource/arm_arch_timer.c 
b/drivers/clocksource/arm_arch_timer.c
index 43005d4..1fa2af9 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -462,7 +462,10 @@ static void __init arch_counter_register(unsigned type)
 
/* Register the CP15 based counter if we have one */
if (type & ARCH_CP15_TIMER) {
-   arch_timer_read_counter = arch_counter_get_cntvct;
+   if (arch_timer_use_virtual)
+   arch_timer_read_counter = arch_counter_get_cntvct;
+   else
+   arch_timer_read_counter = arch_counter_get_cntpct;
} else {
arch_timer_read_counter = arch_counter_get_cntvct_mem;
 
-- 
2.1.0.rc2.206.gedb03e5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v4] clocksource: arch_timer: Fix code to use physical timers when requested

2014-11-23 Thread Sonny Rao

On Fri, Nov 21, 2014 at 12:58 PM, Olof Johansson  wrote:
> On Thu, Nov 20, 2014 at 8:58 AM, Catalin Marinas
>  wrote:
>> Doug,
>>
>> On Thu, Nov 20, 2014 at 04:24:09PM +, Doug Anderson wrote:
>>> On Thu, Nov 20, 2014 at 8:10 AM, Catalin Marinas
>>>  wrote:
>>> > On Wed, Oct 08, 2014 at 08:38:57AM +0100, Sonny Rao wrote:
>>> >> This is a bug fix for using physical arch timers when
>>> >> the arch_timer_use_virtual boolean is false.  It restores the
>>> >> arch_counter_get_cntpct() function after removal in
>>> >>
>>> >> 0d651e4e "clocksource: arch_timer: use virtual counters"
>>> >>
>>> >> We need this on certain ARMv7 systems which are architected like this:
>>> >>
>>> >> * The firmware doesn't know and doesn't care about hypervisor mode and
>>> >>   we don't want to add the complexity of hypervisor there.
>>> >>
>>> >> * The firmware isn't involved in SMP bringup or resume.
>>> >>
>>> >> * The ARCH timer comes up with an uninitialized offset between the
>>> >>   virtual and physical counters.  Each core gets a different random
>>> >>   offset.
>>> >>
>>> >> * The device boots in "Secure SVC" mode.
>>> >>
>>> >> * Nothing has touched the reset value of CNTHCTL.PL1PCEN or
>>> >>   CNTHCTL.PL1PCTEN (both default to 1 at reset)
>>> >>
>>> >> One example of such a system is RK3288 where it is much simpler to
>>> >> use the physical counter since there's nobody managing the offset and
>>> >> each time a core goes down and comes back up it will get reinitialized
>>> >> to some other random value.
>>> >>
>>> >> Fixes: 0d651e4e65e9 ("clocksource: arch_timer: use virtual counters")
>>> >> Cc: sta...@vger.kernel.org
>>> >> Signed-off-by: Sonny Rao 
>>> >> Acked-by: Olof Johansson 
>>> > [...]
>>> >> --- a/arch/arm64/include/asm/arch_timer.h
>>> >> +++ b/arch/arm64/include/asm/arch_timer.h
>>> >> @@ -135,6 +135,16 @@ static inline void arch_timer_evtstrm_enable(int 
>>> >> divider)
>>> >>  #endif
>>> >>  }
>>> >>
>>> >> +static inline u64 arch_counter_get_cntpct(void)
>>> >> +{
>>> >> + u64 cval;
>>> >> +
>>> >> + isb();
>>> >> + asm volatile("mrs %0, cntpct_el0" : "=r" (cval));
>>> >> +
>>> >> + return cval;
>>> >> +}
>>> >
>>> > Sorry but I have to NAK the arm64 changes here. If the firmware is
>>> > broken and does not initialise CNTVOFF properly, please fix it (at least
>>> > on ARMv8 hardware). Also, on arm64 the vdso gettimeofday()
>>> > implementation relies on using the virtual counter, so correct
>>> > initialisation of CNTVOFF is essential.
>>>
>>> Sonny's patch here just makes it so that we honor the global variable.
>>> My patch at <https://patchwork.kernel.org/patch/5051881/> is the one
>>> that allows the global variable to be set.  You can see in that patch
>>> that it's impossible for the variable to be set on ARM64.
>>
>> It just gives people ideas ;), thinking they only need to remove
>> IS_ENABLED(CONFIG_ARM) in your patch and get this working on arm64.
>>
>>> In previous discussions it was agreed that on ARM64 psci (or something
>>> similar) was a requirement anyway and that gave us a way to get the
>>> firmware involved again if we ever need to bring down a processor and
>>> bring it back up in the kernel.  PSCI is not a requirement for ARM32.
>>> There are systems that don't get the firmware involved when a
>>> processor loses state (like if it is powered off and powered on again,
>>> maybe for suspend/resume) and there was pushback against the kernel
>>> itself transitioning into monitor mode to init CNTVOFF in these cases.
>>> People agreed a month ago that these two patches were a reasonable
>>> approach for ARM32.
>>
>> I'm not complaining about arm32 here, just the arm64
>> implementation. If you want to avoid #ifdefs in the arch timer driver,
>> what about, for arm64, defining something like:
>>
>> static inline u64 arch_counter_get_cntpct(void)
>> {
>> /*
>>  * AArch64 kernel and user space mandate the use of CNTVCT.
>>  */
>> BUG();
>> return 0;
>> }
>
> Seems like a reasonable approach to me.

Ok, I will re-spin this one, sorry for the delay.

>
>
> -Olof
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] clk: rockchip: rk3288 export i2s0_clkout for use in DT

2014-11-23 Thread Sonny Rao

On Sun, Nov 23, 2014 at 4:07 PM, Heiko Stübner  wrote:
> Hi Sonny,
>
> Am Dienstag, 18. November 2014, 23:15:19 schrieb Sonny Rao:
>> This exposes the clock that comes out of the i2s block which generally
>> goes to the audio codec.
>>
>> Signed-off-by: Sonny Rao 
>> ---
>>  drivers/clk/rockchip/clk-rk3288.c  | 3 ++-
>>  include/dt-bindings/clock/rk3288-cru.h | 1 +
>>  2 files changed, 3 insertions(+), 1 deletion(-)
>>
>
> [...]
>
>> diff --git a/include/dt-bindings/clock/rk3288-cru.h
>> b/include/dt-bindings/clock/rk3288-cru.h index 100a08c..4acc730 100644
>> --- a/include/dt-bindings/clock/rk3288-cru.h
>> +++ b/include/dt-bindings/clock/rk3288-cru.h
>> @@ -71,6 +71,7 @@
>>  #define SCLK_HDMI_CEC110
>>  #define SCLK_HEVC_CABAC  111
>>  #define SCLK_HEVC_CORE   112
>> +#define SCLK_I2S0_OUT   113
>>
>>  #define DCLK_VOP0190
>>  #define DCLK_VOP1191
>
> just to get branches right, do you plan on sending a patch using this new
> clock-id in a devicetree file in time for 3.19 (i.e. during the next week)?
>
> If you plan on doing this, we'll need a 2-patch series like Alexandru did for
> the mmc phases [because we would need a shared branch between clk and dts
> branches]. If not the patch can stay as it is.
>

Hi, I'm not planning on sending anything with this new clock in the
immediate future, so I think you can take it as is.  Eventually, we
will submit something that uses it for the audio codec on Pinky using
simple-card but I don't have that ready yet.  Thanks!

> Thanks
> Heiko
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v4] clocksource: arch_timer: Fix code to use physical timers when requested

2014-11-20 Thread Sonny Rao

On Thu, Nov 20, 2014 at 12:49 AM, Maxime Ripard
 wrote:
> Hi,
>
> On Wed, Oct 08, 2014 at 12:38:57AM -0700, Sonny Rao wrote:
>> This is a bug fix for using physical arch timers when
>> the arch_timer_use_virtual boolean is false.  It restores the
>> arch_counter_get_cntpct() function after removal in
>>
>> 0d651e4e "clocksource: arch_timer: use virtual counters"
>>
>> We need this on certain ARMv7 systems which are architected like this:
>>
>> * The firmware doesn't know and doesn't care about hypervisor mode and
>>   we don't want to add the complexity of hypervisor there.
>>
>> * The firmware isn't involved in SMP bringup or resume.
>>
>> * The ARCH timer comes up with an uninitialized offset between the
>>   virtual and physical counters.  Each core gets a different random
>>   offset.
>>
>> * The device boots in "Secure SVC" mode.
>>
>> * Nothing has touched the reset value of CNTHCTL.PL1PCEN or
>>   CNTHCTL.PL1PCTEN (both default to 1 at reset)
>>
>> One example of such a system is RK3288 where it is much simpler to
>> use the physical counter since there's nobody managing the offset and
>> each time a core goes down and comes back up it will get reinitialized
>> to some other random value.
>>
>> Fixes: 0d651e4e65e9 ("clocksource: arch_timer: use virtual counters")
>> Cc: sta...@vger.kernel.org
>> Signed-off-by: Sonny Rao 
>> Acked-by: Olof Johansson 
>
> Has this been merged yet?
>
> If not, you can add my Tested-by, it makes the Allwinner A31 boot
> flawlessly with the arch timers (together with the patch "clocksource:
> arch_timer: Allow the device tree to specify uninitialized timer
> registers")

No, it has not been merged, and Doug just pinged Daniel about status
on the other patch you mentioned.  I'm glad these patches are useful
to you on another system; hopefully this will help the case for
inclusion.

Daniel, Mark, Will, others, is there any objection to this patch?  If
not could we please merge?


> Thanks!
> Maxime
>
> Maxime
>
> --
> Maxime Ripard, Free Electrons
> Embedded Linux, Kernel and Android engineering
> http://free-electrons.com
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] clk: rockchip: rk3288 export i2s0_clkout for use in DT

2014-11-18 Thread Sonny Rao

This exposes the clock that comes out of the i2s block which generally
goes to the audio codec.

Signed-off-by: Sonny Rao 
---
 drivers/clk/rockchip/clk-rk3288.c  | 3 ++-
 include/dt-bindings/clock/rk3288-cru.h | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/clk/rockchip/clk-rk3288.c 
b/drivers/clk/rockchip/clk-rk3288.c
index 2327829..837 100644
--- a/drivers/clk/rockchip/clk-rk3288.c
+++ b/drivers/clk/rockchip/clk-rk3288.c
@@ -305,7 +305,8 @@ static struct rockchip_clk_branch rk3288_clk_branches[] 
__initdata = {
RK3288_CLKGATE_CON(4), 2, GFLAGS),
MUX(0, "i2s_pre", mux_i2s_pre_p, CLK_SET_RATE_PARENT,
RK3288_CLKSEL_CON(4), 8, 2, MFLAGS),
-   COMPOSITE_NODIV(0, "i2s0_clkout", mux_i2s_clkout_p, CLK_SET_RATE_PARENT,
+   COMPOSITE_NODIV(SCLK_I2S0_OUT, "i2s0_clkout", mux_i2s_clkout_p,
+   CLK_SET_RATE_PARENT,
RK3288_CLKSEL_CON(4), 12, 1, MFLAGS,
RK3288_CLKGATE_CON(4), 0, GFLAGS),
GATE(SCLK_I2S0, "sclk_i2s0", "i2s_pre", CLK_SET_RATE_PARENT,
diff --git a/include/dt-bindings/clock/rk3288-cru.h 
b/include/dt-bindings/clock/rk3288-cru.h
index 100a08c..4acc730 100644
--- a/include/dt-bindings/clock/rk3288-cru.h
+++ b/include/dt-bindings/clock/rk3288-cru.h
@@ -71,6 +71,7 @@
 #define SCLK_HDMI_CEC  110
 #define SCLK_HEVC_CABAC111
 #define SCLK_HEVC_CORE 112
+#define SCLK_I2S0_OUT   113
 
 #define DCLK_VOP0  190
 #define DCLK_VOP1  191
-- 
2.1.0.rc2.206.gedb03e5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 2/2] ASoC: rockchip: i2s: add support for grabbing output clock to codec

2014-11-18 Thread Sonny Rao

Jay,

On Tue, Nov 18, 2014 at 7:07 PM, Jianqun Xu  wrote:

Mostly FYI, but if you take someone else's patch, you should also
retain their authorship by saying:
From: 
at the top of the message. I don't really mind, but please keep it in
mind for the future, thanks.

> We need to claim the clock which is driving the codec so that when we enable
> clock gating, we continue to clock the codec when needed.  I make this an
> optional clock since there might be some applications where we don't need it
> but can still use the I2S block.
>
> Signed-off-by: Sonny Rao 
> Signed-off-by: Jianqun Xu 
> ---
>  sound/soc/rockchip/rockchip_i2s.c | 11 +++
>  1 file changed, 11 insertions(+)
>
> diff --git a/sound/soc/rockchip/rockchip_i2s.c 
> b/sound/soc/rockchip/rockchip_i2s.c
> index c74ba37..2820ade 100644
> --- a/sound/soc/rockchip/rockchip_i2s.c
> +++ b/sound/soc/rockchip/rockchip_i2s.c
> @@ -28,6 +28,7 @@ struct rk_i2s_dev {
>
> struct clk *hclk;
> struct clk *mclk;
> +   struct clk *oclk;
>
> struct snd_dmaengine_dai_dma_data capture_dma_data;
> struct snd_dmaengine_dai_dma_data playback_dma_data;
> @@ -439,6 +440,14 @@ static int rockchip_i2s_probe(struct platform_device 
> *pdev)
> return PTR_ERR(i2s->mclk);
> }
>
> +   i2s->oclk = devm_clk_get(&pdev->dev, "i2s_clk_out");
> +   if (IS_ERR(i2s->oclk)) {
> +   dev_dbg(&pdev->dev, "Didn't find output clock\n");
> +   i2s->oclk = NULL;
> +   }
> +   if (i2s->oclk)
> +   ret = clk_prepare_enable(i2s->oclk);
> +
> res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
> regs = devm_ioremap_resource(&pdev->dev, res);
> if (IS_ERR(regs))
> @@ -505,6 +514,8 @@ static int rockchip_i2s_remove(struct platform_device 
> *pdev)
> if (!pm_runtime_status_suspended(&pdev->dev))
> i2s_runtime_suspend(&pdev->dev);
>
> +   if (i2s->oclk)
> +   clk_disable_unprepare(i2s->oclk);
> clk_disable_unprepare(i2s->mclk);
> clk_disable_unprepare(i2s->hclk);
> snd_dmaengine_pcm_unregister(&pdev->dev);
> --
> 1.9.1
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v2] mmc: dw_mmc: Reset DMA before enabling IDMAC

2014-10-18 Thread Sonny Rao

On Fri, Oct 17, 2014 at 1:26 AM, Jaehoon Chung  wrote:
> Hi, Sonny.
>
> On 10/17/2014 01:58 AM, Sonny Rao wrote:
>> We've already got a reset of DMA after it's done.  Add one before we
>> start DMA too.  This fixes a data corruption on Rockchip SoCs which
>> will get bad data when doing a DMA transfer after doing a PIO transfer.
>>
>> We tested this on an Exynos 5800 with HS200 and didn't notice any
>> difference in sequential read throughput.
>
> Didn't affect the write throughput?

Write is usually much slower than read, but I went ahead and re-tested
and saw no difference on writes.

> I tested this on exynos3/4 with DDR50 and HS200.
>
> Acked-by: Jaehoon Chung 
> Tested-by: Jaehoon Chung 
>
>>
>> Signed-off-by: Sonny Rao 
>> Signed-off-by: Doug Anderson 
>> Tested-by: Doug Anderson 
>> ---
>>  drivers/mmc/host/dw_mmc.c | 5 +
>>  1 file changed, 5 insertions(+)
>>
>> diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c
>> index 69f0cc6..ca67f69 100644
>> --- a/drivers/mmc/host/dw_mmc.c
>> +++ b/drivers/mmc/host/dw_mmc.c
>> @@ -83,6 +83,7 @@ struct idmac_desc {
>>  #endif /* CONFIG_MMC_DW_IDMAC */
>>
>>  static bool dw_mci_reset(struct dw_mci *host);
>> +static bool dw_mci_ctrl_reset(struct dw_mci *host, u32 reset);
>>
>>  #if defined(CONFIG_DEBUG_FS)
>>  static int dw_mci_req_show(struct seq_file *s, void *v)
>> @@ -448,6 +449,10 @@ static void dw_mci_idmac_start_dma(struct dw_mci *host, 
>> unsigned int sg_len)
>>
>>   dw_mci_translate_sglist(host, host->data, sg_len);
>>
>> + /* Make sure to reset DMA in case we did PIO before this */
>> + dw_mci_ctrl_reset(host, SDMMC_CTRL_DMA_RESET);
>> + dw_mci_idmac_reset(host);
>> +
>>   /* Select IDMAC interface */
>>   temp = mci_readl(host, CTRL);
>>   temp |= SDMMC_CTRL_USE_IDMAC;
>>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v5 1/6] ARM: rockchip: convert to regmap and use pmu syscon if available

2014-10-17 Thread Sonny Rao

On Wed, Oct 15, 2014 at 10:23 AM, Kever Yang  wrote:
> From: Heiko Stuebner 
>
> The pmu register space is - like the GRF - shared by quite some peripherals.
> On the rk3188 and rk3288 even parts of the pinctrl are living there.
> Therefore we normally shouldn't map it a second time when the syscon
> does this already.
>
> Therefore convert the cpu power-domain handling to access the pmu via a
> regmap and at first try to get it via the syscon interface.
> Getting this syscon will only fail if the pmu node does not have the
> "syscon" compatible and thus does not get shared with other drivers.
>
> In this case we map it like before and create the necessary regmap on
> top of it.
>
> Signed-off-by: Heiko Stuebner 
> Signed-off-by: Kever Yang 
> ---
>
> Changes in v5: None
> Changes in v4: None
> Changes in v3:
> - add this patch in version 3
>
> Changes in v2: None
>
>  arch/arm/mach-rockchip/platsmp.c | 104 
> +--
>  1 file changed, 78 insertions(+), 26 deletions(-)
>
> diff --git a/arch/arm/mach-rockchip/platsmp.c 
> b/arch/arm/mach-rockchip/platsmp.c
> index 189684f..4c36fbf 100644
> --- a/arch/arm/mach-rockchip/platsmp.c
> +++ b/arch/arm/mach-rockchip/platsmp.c
> @@ -19,6 +19,8 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#include 
>
>  #include 
>  #include 
> @@ -37,23 +39,42 @@ static int ncores;
>
>  #define PMU_PWRDN_SCU  4
>
> -static void __iomem *pmu_base_addr;
> +static struct regmap *pmu;
>
> -static inline bool pmu_power_domain_is_on(int pd)
> +static int pmu_power_domain_is_on(int pd)
>  {
> -   return !(readl_relaxed(pmu_base_addr + PMU_PWRDN_ST) & BIT(pd));
> +   u32 val;
> +   int ret;
> +
> +   ret = regmap_read(pmu, PMU_PWRDN_ST, &val);
> +   if (ret < 0)
> +   return ret;
> +
> +   return !(val & BIT(pd));
>  }
>
> -static void pmu_set_power_domain(int pd, bool on)
> +static int pmu_set_power_domain(int pd, bool on)
>  {
> -   u32 val = readl_relaxed(pmu_base_addr + PMU_PWRDN_CON);
> -   if (on)
> -   val &= ~BIT(pd);
> -   else
> -   val |=  BIT(pd);
> -   writel(val, pmu_base_addr + PMU_PWRDN_CON);
> -
> -   while (pmu_power_domain_is_on(pd) != on) { }
> +   u32 val = (on) ? 0 : BIT(pd);
> +   int ret;
> +
> +   ret = regmap_update_bits(pmu, PMU_PWRDN_CON, BIT(pd), val);
> +   if (ret < 0) {
> +   pr_err("%s: could not update power domain\n", __func__);
> +   return ret;
> +   }
> +
> +   ret = -1;
> +   while (ret != on) {
> +   ret = pmu_power_domain_is_on(pd);
> +   if (ret < 0) {
> +   pr_err("%s: could not read power domain state\n",
> +__func__);
> +   return ret;
> +   }
> +   }
> +
> +   return 0;
>  }
>
>  /*
> @@ -63,7 +84,7 @@ static void pmu_set_power_domain(int pd, bool on)
>  static int __cpuinit rockchip_boot_secondary(unsigned int cpu,
>  struct task_struct *idle)
>  {
> -   if (!sram_base_addr || !pmu_base_addr) {
> +   if (!sram_base_addr || !pmu) {
> pr_err("%s: sram or pmu missing for cpu boot\n", __func__);
> return -ENXIO;
> }
> @@ -75,9 +96,7 @@ static int __cpuinit rockchip_boot_secondary(unsigned int 
> cpu,
> }
>
> /* start the core */
> -   pmu_set_power_domain(0 + cpu, true);
> -
> -   return 0;
> +   return pmu_set_power_domain(0 + cpu, true);
>  }
>
>  /**
> @@ -125,6 +144,48 @@ static int __init rockchip_smp_prepare_sram(struct 
> device_node *node)
> return 0;
>  }
>
> +static struct regmap_config rockchip_pmu_regmap_config = {
> +   .reg_bits = 32,
> +   .val_bits = 32,
> +   .reg_stride = 4,
> +};
> +
> +static int __init rockchip_smp_prepare_pmu(void)
> +{
> +   struct device_node *node;
> +   void __iomem *pmu_base;
> +
> +   pmu = syscon_regmap_lookup_by_compatible("rockchip,rk3066-pmu");
> +   if (!IS_ERR(pmu))
> +   return 0;
> +
> +   /* fallback, create our own regmap for the pmu area */

I don't think you need this fallback, the syscon driver should take
care of mapping and creating the regmap for you -- assuming that the
pmu node has the "syscon" property.

> +   pmu = NULL;
> +   node = of_find_compatible_node(NULL, NULL, "rockchip,rk3066-pmu");
> +   if (!node) {
> +   pr_err("%s: could not find pmu dt node\n", __func__);
> +   return -ENODEV;
> +   }
> +
> +   pmu_base = of_iomap(node, 0);
> +   if (!pmu_base) {
> +   pr_err("%s: could not map pmu registers\n", __func__);
> +   return -ENOMEM;
> +   }
> +
> +   pmu = regmap_init_mmio(NULL, pmu_base, &rockchip_pmu_regmap_config);
> +   if (IS_ERR(pmu)) {
> +   int ret = PTR_ERR(pmu);
> +
> +   iounmap(pmu_base);
> +

[PATCH v2] mmc: dw_mmc: Reset DMA before enabling IDMAC

2014-10-16 Thread Sonny Rao

We've already got a reset of DMA after it's done.  Add one before we
start DMA too.  This fixes a data corruption on Rockchip SoCs which
will get bad data when doing a DMA transfer after doing a PIO transfer.

We tested this on an Exynos 5800 with HS200 and didn't notice any
difference in sequential read throughput.

Signed-off-by: Sonny Rao 
Signed-off-by: Doug Anderson 
Tested-by: Doug Anderson 
---
 drivers/mmc/host/dw_mmc.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c
index 69f0cc6..ca67f69 100644
--- a/drivers/mmc/host/dw_mmc.c
+++ b/drivers/mmc/host/dw_mmc.c
@@ -83,6 +83,7 @@ struct idmac_desc {
 #endif /* CONFIG_MMC_DW_IDMAC */
 
 static bool dw_mci_reset(struct dw_mci *host);
+static bool dw_mci_ctrl_reset(struct dw_mci *host, u32 reset);
 
 #if defined(CONFIG_DEBUG_FS)
 static int dw_mci_req_show(struct seq_file *s, void *v)
@@ -448,6 +449,10 @@ static void dw_mci_idmac_start_dma(struct dw_mci *host, 
unsigned int sg_len)
 
dw_mci_translate_sglist(host, host->data, sg_len);
 
+   /* Make sure to reset DMA in case we did PIO before this */
+   dw_mci_ctrl_reset(host, SDMMC_CTRL_DMA_RESET);
+   dw_mci_idmac_reset(host);
+
/* Select IDMAC interface */
temp = mci_readl(host, CTRL);
temp |= SDMMC_CTRL_USE_IDMAC;
-- 
2.1.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] clk: rockchip: fix parent for spdif_8ch_frac on rk3288

2014-10-08 Thread Sonny Rao

The parent should be spdif_8ch_pre not spdif_8ch_src, which doesn't
exist and looks to be a typo.  The TRM also confirms this.

Signed-off-by: Sonny Rao 
---
 drivers/clk/rockchip/clk-rk3288.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/clk/rockchip/clk-rk3288.c 
b/drivers/clk/rockchip/clk-rk3288.c
index 2327829..e41ae1f 100644
--- a/drivers/clk/rockchip/clk-rk3288.c
+++ b/drivers/clk/rockchip/clk-rk3288.c
@@ -325,7 +325,7 @@ static struct rockchip_clk_branch rk3288_clk_branches[] 
__initdata = {
COMPOSITE_NOMUX(0, "spdif_8ch_pre", "spdif_src", 0,
RK3288_CLKSEL_CON(40), 0, 7, DFLAGS,
RK3288_CLKGATE_CON(4), 7, GFLAGS),
-   COMPOSITE_FRAC(0, "spdif_8ch_frac", "spdif_8ch_src", 0,
+   COMPOSITE_FRAC(0, "spdif_8ch_frac", "spdif_8ch_pre", 0,
RK3288_CLKSEL_CON(41), 0,
RK3288_CLKGATE_CON(4), 8, GFLAGS),
COMPOSITE_NODIV(SCLK_SPDIF8CH, "sclk_spdif_8ch", mux_spdif_8ch_p, 0,
-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] ASoC: rockchip-i2s: fix infinite loop in rockchip_snd_txctrl

2014-10-08 Thread Sonny Rao

We can get into an infinite loop if the I2S_CLR register fails to
clear due to a missing break statement, so add that.

Signed-off-by: Sonny Rao 
---
 sound/soc/rockchip/rockchip_i2s.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sound/soc/rockchip/rockchip_i2s.c 
b/sound/soc/rockchip/rockchip_i2s.c
index 033487c..f373e37 100644
--- a/sound/soc/rockchip/rockchip_i2s.c
+++ b/sound/soc/rockchip/rockchip_i2s.c
@@ -108,8 +108,10 @@ static void rockchip_snd_txctrl(struct rk_i2s_dev *i2s, 
int on)
while (val) {
regmap_read(i2s->regmap, I2S_CLR, &val);
retry--;
-   if (!retry)
+   if (!retry) {
dev_warn(i2s->dev, "fail to clear\n");
+   break;
+   }
}
}
}
-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v4] clocksource: arch_timer: Fix code to use physical timers when requested

2014-10-08 Thread Sonny Rao

This is a bug fix for using physical arch timers when
the arch_timer_use_virtual boolean is false.  It restores the
arch_counter_get_cntpct() function after removal in

0d651e4e "clocksource: arch_timer: use virtual counters"

We need this on certain ARMv7 systems which are architected like this:

* The firmware doesn't know and doesn't care about hypervisor mode and
  we don't want to add the complexity of hypervisor there.

* The firmware isn't involved in SMP bringup or resume.

* The ARCH timer comes up with an uninitialized offset between the
  virtual and physical counters.  Each core gets a different random
  offset.

* The device boots in "Secure SVC" mode.

* Nothing has touched the reset value of CNTHCTL.PL1PCEN or
  CNTHCTL.PL1PCTEN (both default to 1 at reset)

One example of such a system is RK3288 where it is much simpler to
use the physical counter since there's nobody managing the offset and
each time a core goes down and comes back up it will get reinitialized
to some other random value.

Fixes: 0d651e4e65e9 ("clocksource: arch_timer: use virtual counters")
Cc: sta...@vger.kernel.org
Signed-off-by: Sonny Rao 
Acked-by: Olof Johansson 
---
v2: Add fixes tag to commit message, cc stable, copy Doug's
description of the systems which need this in commit message.
v3: Don't change the memory-mapped physical timer/counter code
v4: remove the memory-mapped physical counter code since it's not used
---
 arch/arm/include/asm/arch_timer.h|  9 +
 arch/arm64/include/asm/arch_timer.h  | 10 ++
 drivers/clocksource/arm_arch_timer.c | 10 +++---
 3 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/arch/arm/include/asm/arch_timer.h 
b/arch/arm/include/asm/arch_timer.h
index 0704e0c..e72aa4d 100644
--- a/arch/arm/include/asm/arch_timer.h
+++ b/arch/arm/include/asm/arch_timer.h
@@ -78,6 +78,15 @@ static inline u32 arch_timer_get_cntfrq(void)
return val;
 }
 
+static inline u64 arch_counter_get_cntpct(void)
+{
+   u64 cval;
+
+   isb();
+   asm volatile("mrrc p15, 0, %Q0, %R0, c14" : "=r" (cval));
+   return cval;
+}
+
 static inline u64 arch_counter_get_cntvct(void)
 {
u64 cval;
diff --git a/arch/arm64/include/asm/arch_timer.h 
b/arch/arm64/include/asm/arch_timer.h
index 9400596..58657c4 100644
--- a/arch/arm64/include/asm/arch_timer.h
+++ b/arch/arm64/include/asm/arch_timer.h
@@ -135,6 +135,16 @@ static inline void arch_timer_evtstrm_enable(int divider)
 #endif
 }
 
+static inline u64 arch_counter_get_cntpct(void)
+{
+   u64 cval;
+
+   isb();
+   asm volatile("mrs %0, cntpct_el0" : "=r" (cval));
+
+   return cval;
+}
+
 static inline u64 arch_counter_get_cntvct(void)
 {
u64 cval;
diff --git a/drivers/clocksource/arm_arch_timer.c 
b/drivers/clocksource/arm_arch_timer.c
index 6b50311..799139f 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -429,10 +429,14 @@ static void __init arch_counter_register(unsigned type)
u64 start_count;
 
/* Register the CP15 based counter if we have one */
-   if (type & ARCH_CP15_TIMER)
-   arch_timer_read_counter = arch_counter_get_cntvct;
-   else
+   if (type & ARCH_CP15_TIMER) {
+   if (arch_timer_use_virtual)
+   arch_timer_read_counter = arch_counter_get_cntvct;
+   else
+   arch_timer_read_counter = arch_counter_get_cntpct;
+   } else {
arch_timer_read_counter = arch_counter_get_cntvct_mem;
+   }
 
start_count = arch_timer_read_counter();
clocksource_register_hz(&clocksource_counter, arch_timer_rate);
-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v4] clocksource: arch_timer: Allow the device tree to specify uninitialized timer registers

2014-10-08 Thread Sonny Rao

From: Doug Anderson 

Some 32-bit (ARMv7) systems are architected like this:

* The firmware doesn't know and doesn't care about hypervisor mode and
  we don't want to add the complexity of hypervisor there.

* The firmware isn't involved in SMP bringup or resume.

* The ARCH timer comes up with an uninitialized offset (CNTVOFF)
  between the virtual and physical counters.  Each core gets a
  different random offset.

* The device boots in "Secure SVC" mode.

* Nothing has touched the reset value of CNTHCTL.PL1PCEN or
  CNTHCTL.PL1PCTEN (both default to 1 at reset)

On systems like the above, it doesn't make sense to use the virtual
counter.  There's nobody managing the offset and each time a core goes
down and comes back up it will get reinitialized to some other random
value.

This adds an optional property which can inform the kernel of this
situation, and firmware is free to remove the property if it is going
to initialize the CNTVOFF registers when each CPU comes out of reset.

Currently, the best course of action in this case is to use the
physical timer, which is why it is important that CNTHCTL hasn't been
changed from its reset value and it's a reasonable assumption given
that the firmware has never entered HYP mode.

Note that it's been said that on ARMv8 systems the firmware and
kernel really can't be architected as described above.  That means
using the physical timer like this really only makes sense for ARMv7
systems.

Signed-off-by: Doug Anderson 
Signed-off-by: Sonny Rao 
Reviewed-by: Mark Rutland 
---
Changes in v2:
- Add "#ifdef CONFIG_ARM" as per Will Deacon

Changes in v3:
- change property name to arm,cntvoff-not-fw-configured and specify
  that the value of CNTHCTL.PL1PC(T)EN must still be the reset value
  of 1 as per Mark Rutland

Changes in v4:
- change property name to arm,cpu-registers-not-fw-configured and
  specify that all cpu registers must have architected reset values
  per Mark Rutland
- change from "#ifdef CONFIG_ARM" to "if (IS_ENABLED(CONFIG_ARM))" per
  Arnd Bergmann
---
 Documentation/devicetree/bindings/arm/arch_timer.txt | 8 
 drivers/clocksource/arm_arch_timer.c | 8 
 2 files changed, 16 insertions(+)

diff --git a/Documentation/devicetree/bindings/arm/arch_timer.txt 
b/Documentation/devicetree/bindings/arm/arch_timer.txt
index 37b2caf..256b4d8 100644
--- a/Documentation/devicetree/bindings/arm/arch_timer.txt
+++ b/Documentation/devicetree/bindings/arm/arch_timer.txt
@@ -22,6 +22,14 @@ to deliver its interrupts via SPIs.
 - always-on : a boolean property. If present, the timer is powered through an
   always-on power domain, therefore it never loses context.
 
+** Optional properties:
+
+- arm,cpu-registers-not-fw-configured : Firmware does not initialize
+  any of the generic timer CPU registers, which contain their
+  architecturally-defined reset values. Only supported for 32-bit
+  systems which follow the ARMv7 architected reset values.
+
+
 Example:
 
timer {
diff --git a/drivers/clocksource/arm_arch_timer.c 
b/drivers/clocksource/arm_arch_timer.c
index 8daf056..799139f 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -654,6 +654,14 @@ static void __init arch_timer_init(struct device_node *np)
arch_timer_detect_rate(NULL, np);
 
/*
+* If we cannot rely on firmware initializing the timer registers then
+* we should use the physical timers instead.
+*/
+   if (IS_ENABLED(CONFIG_ARM) &&
+   of_property_read_bool(np, "arm,cpu-registers-not-fw-configured"))
+   arch_timer_use_virtual = false;
+
+   /*
 * If HYP mode is available, we know that the physical timer
 * has been configured to be accessible from PL1. Use it, so
 * that a guest can use the virtual timer instead.
-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v3] clocksource: arch_timer: Allow the device tree to specify uninitialized CNTVOFF

2014-10-06 Thread Sonny Rao

From: Doug Anderson 

Some 32-bit (ARMv7) systems are architected like this:

* The firmware doesn't know and doesn't care about hypervisor mode and
  we don't want to add the complexity of hypervisor there.

* The firmware isn't involved in SMP bringup or resume.

* The ARCH timer comes up with an uninitialized offset (CNTVOFF)
  between the virtual and physical counters.  Each core gets a
  different random offset.

* The device boots in "Secure SVC" mode.

* Nothing has touched the reset value of CNTHCTL.PL1PCEN or
  CNTHCTL.PL1PCTEN (both default to 1 at reset)

On systems like the above, it doesn't make sense to use the virtual
counter.  There's nobody managing the offset and each time a core goes
down and comes back up it will get reinitialized to some other random
value.

This adds an optional property which can inform the kernel of this
situation, and firmware is free to remove the property if it is going
to initialize the CNTVOFF registers when each CPU comes out of reset.

Currently, the best course of action in this case is to use the
physical timer, which is why it is important that CNTHCTL hasn't been
changed from its reset value and it's a reasonable assumption given
that the firmware has never entered HYP mode.

Note that it's been said that on ARM64 (ARMv8) systems the firmware and
kernel really can't be architected as described above.  That means
using the physical timer like this really only makes sense for ARMv7
systems.

Signed-off-by: Doug Anderson 
Signed-off-by: Sonny Rao 
---
Changes in v2:
- Add "#ifdef CONFIG_ARM" as per Will Deacon

Changes in v3:
- change property name to arm,cntvoff-not-fw-configured and specify
  that the value of CNTHCTL.PL1PC(T)EN must still be the reset value
  of 1 as per Mark Rutland
---
 Documentation/devicetree/bindings/arm/arch_timer.txt | 8 
 drivers/clocksource/arm_arch_timer.c | 9 +
 2 files changed, 17 insertions(+)

diff --git a/Documentation/devicetree/bindings/arm/arch_timer.txt 
b/Documentation/devicetree/bindings/arm/arch_timer.txt
index 37b2caf..67837c9 100644
--- a/Documentation/devicetree/bindings/arm/arch_timer.txt
+++ b/Documentation/devicetree/bindings/arm/arch_timer.txt
@@ -22,6 +22,14 @@ to deliver its interrupts via SPIs.
 - always-on : a boolean property. If present, the timer is powered through an
   always-on power domain, therefore it never loses context.
 
+** Optional properties:
+
+- arm,cntvoff-not-fw-configured : Firmware does not initialize
+  CNTVOFF, which may reset to arbitrary and different values on each
+  CPU.  CNTHCTL.PL1PC(T)EN must both be 1, which is the reset value
+  specified by the architecture.   Only supported for ARM (not ARM64).
+
+
 Example:
 
timer {
diff --git a/drivers/clocksource/arm_arch_timer.c 
b/drivers/clocksource/arm_arch_timer.c
index bd8da15..234d7b9 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -668,6 +668,15 @@ static void __init arch_timer_init(struct device_node *np)
arch_timer_ppi[i] = irq_of_parse_and_map(np, i);
arch_timer_detect_rate(NULL, np);
 
+#ifdef CONFIG_ARM
+   /*
+* If we cannot rely on firmware initializing the CNTVOFF then
+* we should use the physical timers instead.
+*/
+   if (of_property_read_bool(np, "arm,cntvoff-not-fw-configured"))
+   arch_timer_use_virtual = false;
+#endif
+
/*
 * If HYP mode is available, we know that the physical timer
 * has been configured to be accessible from PL1. Use it, so
-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v3] clocksource: arch_timer: Fix code to use physical timers when requested

2014-10-06 Thread Sonny Rao

This is a bug fix for using physical arch timers when
the arch_timer_use_virtual boolean is false.  It restores the
arch_counter_get_cntpct() function after removal in

0d651e4e "clocksource: arch_timer: use virtual counters"

We need this on certain ARMv7 systems which are architected like this:

* The firmware doesn't know and doesn't care about hypervisor mode and
  we don't want to add the complexity of hypervisor there.

* The firmware isn't involved in SMP bringup or resume.

* The ARCH timer comes up with an uninitialized offset between the
  virtual and physical counters.  Each core gets a different random
  offset.

* The device boots in "Secure SVC" mode.

* Nothing has touched the reset value of CNTHCTL.PL1PCEN or
  CNTHCTL.PL1PCTEN (both default to 1 at reset)

One example of such a system is RK3288 where it is much simpler to
use the physical counter since there's nobody managing the offset and
each time a core goes down and comes back up it will get reinitialized
to some other random value.

Fixes: 0d651e4e65e9 ("clocksource: arch_timer: use virtual counters")
Cc: sta...@vger.kernel.org
Signed-off-by: Sonny Rao 
Acked-by: Olof Johansson 
---
v2: Add fixes tag to commit message, cc stable, copy Doug's
description of the systems which need this in commit message.
v3: Don't change the memory-mapped physical timer/counter code
---
 arch/arm/include/asm/arch_timer.h|  9 +
 arch/arm64/include/asm/arch_timer.h  | 10 ++
 drivers/clocksource/arm_arch_timer.c | 25 ++---
 3 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/arch/arm/include/asm/arch_timer.h 
b/arch/arm/include/asm/arch_timer.h
index 0704e0c..e72aa4d 100644
--- a/arch/arm/include/asm/arch_timer.h
+++ b/arch/arm/include/asm/arch_timer.h
@@ -78,6 +78,15 @@ static inline u32 arch_timer_get_cntfrq(void)
return val;
 }
 
+static inline u64 arch_counter_get_cntpct(void)
+{
+   u64 cval;
+
+   isb();
+   asm volatile("mrrc p15, 0, %Q0, %R0, c14" : "=r" (cval));
+   return cval;
+}
+
 static inline u64 arch_counter_get_cntvct(void)
 {
u64 cval;
diff --git a/arch/arm64/include/asm/arch_timer.h 
b/arch/arm64/include/asm/arch_timer.h
index 9400596..58657c4 100644
--- a/arch/arm64/include/asm/arch_timer.h
+++ b/arch/arm64/include/asm/arch_timer.h
@@ -135,6 +135,16 @@ static inline void arch_timer_evtstrm_enable(int divider)
 #endif
 }
 
+static inline u64 arch_counter_get_cntpct(void)
+{
+   u64 cval;
+
+   isb();
+   asm volatile("mrs %0, cntpct_el0" : "=r" (cval));
+
+   return cval;
+}
+
 static inline u64 arch_counter_get_cntvct(void)
 {
u64 cval;
diff --git a/drivers/clocksource/arm_arch_timer.c 
b/drivers/clocksource/arm_arch_timer.c
index 5163ec1..bd8da15 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -30,6 +30,8 @@
 #define CNTTIDR0x08
 #define CNTTIDR_VIRT(n)(BIT(1) << ((n) * 4))
 
+#define CNTPCT_LO  0x00
+#define CNTPCT_HI  0x04
 #define CNTVCT_LO  0x08
 #define CNTVCT_HI  0x0c
 #define CNTFRQ 0x10
@@ -386,6 +388,19 @@ static u64 arch_counter_get_cntvct_mem(void)
return ((u64) vct_hi << 32) | vct_lo;
 }
 
+static u64 arch_counter_get_cntpct_mem(void)
+{
+   u32 pct_lo, pct_hi, tmp_hi;
+
+   do {
+   pct_hi = readl_relaxed(arch_counter_base + CNTPCT_HI);
+   pct_lo = readl_relaxed(arch_counter_base + CNTPCT_LO);
+   tmp_hi = readl_relaxed(arch_counter_base + CNTPCT_HI);
+   } while (pct_hi != tmp_hi);
+
+   return ((u64) pct_hi << 32) | pct_lo;
+}
+
 /*
  * Default to cp15 based access because arm64 uses this function for
  * sched_clock() before DT is probed and the cp15 method is guaranteed
@@ -429,10 +444,14 @@ static void __init arch_counter_register(unsigned type)
u64 start_count;
 
/* Register the CP15 based counter if we have one */
-   if (type & ARCH_CP15_TIMER)
-   arch_timer_read_counter = arch_counter_get_cntvct;
-   else
+   if (type & ARCH_CP15_TIMER) {
+   if (arch_timer_use_virtual)
+   arch_timer_read_counter = arch_counter_get_cntvct;
+   else
+   arch_timer_read_counter = arch_counter_get_cntpct;
+   } else {
arch_timer_read_counter = arch_counter_get_cntvct_mem;
+   }
 
start_count = arch_timer_read_counter();
clocksource_register_hz(&clocksource_counter, arch_timer_rate);
-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] mmc: dw_mmc: Reset DMA before enabling IDMAC

2014-10-06 Thread Sonny Rao

We've already got a reset of DMA after it's done.  Add one before we
start DMA too.  This fixes a data corruption on Rockchip SoCs which
will get bad data when doing a DMA transfer after doing a PIO transfer.

We tested this on an Exynos 5800 with HS200 and didn't notice any
difference in sequential read throughput.

Signed-off-by: Sonny Rao 
Signed-off-by: Doug Anderson 
---
 drivers/mmc/host/dw_mmc.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c
index 69f0cc6..2b5401e 100644
--- a/drivers/mmc/host/dw_mmc.c
+++ b/drivers/mmc/host/dw_mmc.c
@@ -83,6 +83,7 @@ struct idmac_desc {
 #endif /* CONFIG_MMC_DW_IDMAC */
 
 static bool dw_mci_reset(struct dw_mci *host);
+static bool dw_mci_ctrl_reset(struct dw_mci *host, u32 reset);
 
 #if defined(CONFIG_DEBUG_FS)
 static int dw_mci_req_show(struct seq_file *s, void *v)
@@ -448,6 +449,9 @@ static void dw_mci_idmac_start_dma(struct dw_mci *host, 
unsigned int sg_len)
 
dw_mci_translate_sglist(host, host->data, sg_len);
 
+   /* Make sure to reset DMA in case we did PIO before this */
+   dw_mci_ctrl_reset(host, SDMMC_CTRL_DMA_RESET);
+
/* Select IDMAC interface */
temp = mci_readl(host, CTRL);
temp |= SDMMC_CTRL_USE_IDMAC;
-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v2] clocksource: arch_timer: Fix code to use physical timers when requested

2014-09-29 Thread Sonny Rao

On Fri, Sep 26, 2014 at 2:47 AM, Mark Rutland  wrote:
> Hi Sonny,
>
> Apologies for the delay in replying, I'd hoped to cover this at Connect,
> but we didn't seem to get the time, and since I've been back in the UK
> it slipped my mind.

Hi Mark, no problem, thanks for following up.

>
> On Thu, Sep 11, 2014 at 11:18:15PM +0100, Sonny Rao wrote:
>> This is a bug fix for using physical arch timers when
>> the arch_timer_use_virtual boolean is false.  It restores the
>> arch_counter_get_cntpct() function after removal in
>>
>> 0d651e4e "clocksource: arch_timer: use virtual counters"
>
> Given we cannot get firmware involved, I am happy to use the physical
> cp15 counters when we can't guarantee the value of CNTVOFF.
>
>> and completes the implementation of memory mapped access for physical
>> timers, so if a system is trying to use physical timers, it will
>> function properly.
>
> I don't see why we need to change the MMIO timers. Those are global
> rather than per-cpu, aren't turned off when CPUs go down (or they'd be
> useless), and we only use a single frame, so I don't see why the value
> of the virtual offset should matter.
>
> Additionally, the CP15 and MMIO timers could be configured separately
> w.r.t. timer and counter access, and for the MMIO timers we can
> determine which we can access by reading a register.
>
> I do not think the selection of physical/virtual timers should be shared
> by the CP15 and MMIO timers.

Ok, I see what you're saying.  I'll remove the physical memory mapped
access code and re-post.

> Mark.
>
>>
>> We need this on certain ARMv7 systems which are architected like this:
>>
>> * The firmware doesn't know and doesn't care about hypervisor mode and
>>   we don't want to add the complexity of hypervisor there.
>>
>> * The firmware isn't involved in SMP bringup or resume.
>>
>> * The ARCH timer come up with an uninitialized offset between the
>>   virtual and physical counters.  Each core gets a different random
>>   offset.
>>
>> * The device boots in "Secure SVC" mode.
>>
>> * Nothing has touched the reset value of CNTHCTL.PL1PCEN or
>>   CNTHCTL.PL1PCTEN (both default to 1 at reset)
>>
>> One example of such as system is RK3288 where it is much simpler to
>> use the physical counter since there's nobody managing the offset and
>> each time a core goes down and comes back up it will get reinitialized
>> to some other random value.
>>
>> Fixes: 0d651e4e65e9 ("clocksource: arch_timer: use virtual counters")
>> Cc: sta...@vger.kernel.org
>> Signed-off-by: Sonny Rao 
>> Acked-by: Olof Johansson 
>> ---
>> v2: Add fixes tag to commit message, cc stable, copy Doug's
>> description of the systems which need this in commit message.
>> ---
>>  arch/arm/include/asm/arch_timer.h|  9 +
>>  arch/arm64/include/asm/arch_timer.h  | 10 ++
>>  drivers/clocksource/arm_arch_timer.c | 30 ++
>>  3 files changed, 45 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch/arm/include/asm/arch_timer.h 
>> b/arch/arm/include/asm/arch_timer.h
>> index 0704e0c..e72aa4d 100644
>> --- a/arch/arm/include/asm/arch_timer.h
>> +++ b/arch/arm/include/asm/arch_timer.h
>> @@ -78,6 +78,15 @@ static inline u32 arch_timer_get_cntfrq(void)
>>   return val;
>>  }
>>
>> +static inline u64 arch_counter_get_cntpct(void)
>> +{
>> + u64 cval;
>> +
>> + isb();
>> + asm volatile("mrrc p15, 0, %Q0, %R0, c14" : "=r" (cval));
>> + return cval;
>> +}
>> +
>>  static inline u64 arch_counter_get_cntvct(void)
>>  {
>>   u64 cval;
>> diff --git a/arch/arm64/include/asm/arch_timer.h 
>> b/arch/arm64/include/asm/arch_timer.h
>> index 9400596..58657c4 100644
>> --- a/arch/arm64/include/asm/arch_timer.h
>> +++ b/arch/arm64/include/asm/arch_timer.h
>> @@ -135,6 +135,16 @@ static inline void arch_timer_evtstrm_enable(int 
>> divider)
>>  #endif
>>  }
>>
>> +static inline u64 arch_counter_get_cntpct(void)
>> +{
>> + u64 cval;
>> +
>> + isb();
>> + asm volatile("mrs %0, cntpct_el0" : "=r" (cval));
>> +
>> + return cval;
>> +}
>> +
>>  static inline u64 arch_counter_get_cntvct(void)
>>  {
>>   u64 cval;
>> diff --git a/drivers/clocksource/arm_arch_timer.c 
>> b/drivers/clocksource/arm_a

Re: [PATCH v3] clocksource: arch_timer: Allow the device tree to specify the physical timer

2014-09-29 Thread Sonny Rao

On Fri, Sep 26, 2014 at 3:00 AM, Mark Rutland  wrote:
> On Thu, Sep 11, 2014 at 06:00:01PM +0100, Doug Anderson wrote:
>> Some 32-bit (ARMv7) systems are architected like this:
>>
>> * The firmware doesn't know and doesn't care about hypervisor mode and
>>   we don't want to add the complexity of hypervisor there.
>>
>> * The firmware isn't involved in SMP bringup or resume.
>>
>> * The ARCH timer come up with an uninitialized offset between the
>>   virtual and physical counters.  Each core gets a different random
>>   offset.
>>
>> * The device boots in "Secure SVC" mode.
>>
>> * Nothing has touched the reset value of CNTHCTL.PL1PCEN or
>>   CNTHCTL.PL1PCTEN (both default to 1 at reset)
>>
>> On systems like the above, it doesn't make sense to use the virtual
>> counter.  There's nobody managing the offset and each time a core goes
>> down and comes back up it will get reinitialized to some other random
>> value.
>>
>> Let's add a property to the device tree to say that we shouldn't use
>> the virtual timer.  Firmware could potentially remove this property
>> before passing the device tree to the kernel if it really wants the
>> kernel to use a virtual timer.
>>
>> Note that it's been said that ARM64 (ARMv8) systems the firmware and
>> kernel really can't be architected as described above.  That means
>> using the physical timer like this really only makes sense for ARMv7
>> systems.
>>
>> In order for this patch to do anything useful, we also need Sonny's
>> patch at <https://patchwork.kernel.org/patch/4790921/>
>>
>> Signed-off-by: Doug Anderson 
>> Signed-off-by: Sonny Rao 
>> ---
>> Changes in v3:
>> - Wording changes to bindings and patch desc as per Will Deacon
>>
>> Changes in v2:
>> - Add "#ifdef CONFIG_ARM" as per Will Deacon
>>
>>  Documentation/devicetree/bindings/arm/arch_timer.txt | 6 ++
>>  drivers/clocksource/arm_arch_timer.c | 5 +
>>  2 files changed, 11 insertions(+)
>>
>> diff --git a/Documentation/devicetree/bindings/arm/arch_timer.txt 
>> b/Documentation/devicetree/bindings/arm/arch_timer.txt
>> index 37b2caf..e28fced 100644
>> --- a/Documentation/devicetree/bindings/arm/arch_timer.txt
>> +++ b/Documentation/devicetree/bindings/arm/arch_timer.txt
>> @@ -22,6 +22,12 @@ to deliver its interrupts via SPIs.
>>  - always-on : a boolean property. If present, the timer is powered through 
>> an
>>always-on power domain, therefore it never loses context.
>>
>> +** Optional properties:
>> +
>> +- arm,use-physical-timer : Don't ever use the virtual timer, just use the
>> +  physical one.  Only supported for ARM (not ARM64).
>
> I'm still not keen on telling the kernel what to do rather than
> describing the actual state of affairs and having the kernel decide what
> to do. Perhaps what we actually need is:
>
> - cntvoff-not-fw-configured: Firmware does not configure CNTVOFF, which
>   may reset to (different) arbitrary values on each CPU.
>
> This also doesn't describe that CNTHCTL.PL1PC(T)EN must both be 1. While
> that is the reset state, it still feels dodgy to me to rely on that.
>
> Mark.

Mark, I'm happy to repost it with that name for Doug.

I think it's fair to describe this state in the binding, and if a
firmware were to put this property into the device-tree and also
have CNTHCTL.PL1PC(T)EN configured to 0, then the kernel can
merely consider that to be a broken usage of this property.   We
certainly can't protect against all of the possible invalid states
caused and probably shouldn't try.  If we implement something like
Christopher's suggestion for transitioning from secure svc to NS hyp
mode then the kernel can simply ignore this property at that point.

>> +
>> +
>>  Example:
>>
>>   timer {
>> diff --git a/drivers/clocksource/arm_arch_timer.c 
>> b/drivers/clocksource/arm_arch_timer.c
>> index 5163ec1..e7aa256 100644
>> --- a/drivers/clocksource/arm_arch_timer.c
>> +++ b/drivers/clocksource/arm_arch_timer.c
>> @@ -649,6 +649,11 @@ static void __init arch_timer_init(struct device_node 
>> *np)
>>   arch_timer_ppi[i] = irq_of_parse_and_map(np, i);
>>   arch_timer_detect_rate(NULL, np);
>>
>> +#ifdef CONFIG_ARM
>> + if (of_property_read_bool(np, "arm,use-physical-timer"))
>> + arch_timer_use_virtual = false;
>> +#endif
>> +
>>   /*
>>* If HYP mode is available, we know that the physical timer
>>* has been configured to be accessible from PL1. Use it, so
>> --
>> 2.1.0.rc2.206.gedb03e5
>>
>>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC] arm: Handle starting up in secure mode

2014-09-19 Thread Sonny Rao

On Fri, Sep 19, 2014 at 6:30 AM, Catalin Marinas
 wrote:
> On Fri, Sep 19, 2014 at 02:22:10PM +0100, Christopher Covington wrote:
>> On 09/19/2014 01:56 AM, Peter Maydell wrote:
>> > On 17 September 2014 06:25, Christopher Covington  
>> > wrote:
>> >> On 09/16/2014 05:24 PM, Christopher Covington wrote:
>> >>> On 09/16/2014 05:09 PM, Christopher Covington wrote:
>>  ARM Linux currently has the most features available to it in hypervisor
>>  (HYP) mode, so switch to it when possible. This can also ensure proper
>>  reset of newer registers such as CNTVOFF.
>> 
>>  The permissions on the Non-Secure Access Control Register (NSACR) are
>>  used to probe what the security setting currently is when in supervisor
>>  (SVC) mode.
>> >>>
>> >>> Sorry, this doesn't work yet. I was misinterpreting my test results. For 
>> >>> what
>> >>> it's worth, my testing and development methodology is to run it after 
>> >>> hacked
>> >>> up versions of the semihosting bootwrapper on the simulator that 
>> >>> corresponds
>> >>> to rtsm_ve-aemv8a.dtb (AEM VE FVP these days?) and examine the 
>> >>> instruction traces.
>> >>
>> >> Looks like the real problem was that I was hacking up the bootwrapper
>> >> incorrectly--my start-in-secure-mode bootwrapper variant wasn't setting 
>> >> up the
>> >> GIC for non-secure access. With that changed, I've tested the following
>> >> variations using the Image file in a single core configuration.
>> >>
>> >> Start in non-secure SVC with non-secure access to GIC configured.
>> >>
>> >> Start in secure SVC with non-secure access to GIC configured.
>> >>
>> >> Start in secure SVC with non-secure access to GIC configured and 
>> >> hypervisor
>> >> support disabled in the model (-C cluster.has_el2=0). This required 
>> >> setting
>> >> the VBAR again in non-secure SVC but with that fix it seems to work. I'll
>> >> include this change in v2.
>> >
>> > If you're relying on the boot loader to set up the GIC to support
>> > non-secure access anyway, why not just have it boot the kernel in Hyp
>> > like the boot protocol document recommends? (The same thing as the GIC
>> > is going to apply for any other hardware that needs configuration to
>> > allow NS access; if we need the firmware to deal with this we might as
>> > well just have it boot us in the right mode too.)
>>
>> I'd like to get rid of as much of the bootwrapper as possible (having gotten
>> spoiled by using QEMU's built-in bootloader). I'm just taking it one step at 
>> a
>> time. Handling GIC initialization in the kernel is probably the next step.
>
> The problem is that the kernel doesn't know about GIC until much later.
> So I don't see an easy workaround, other than relying on the boot-loader
> to do the right thing (and then we go to the point Peter made about
> changing it to start Linux in Hyp mode directly).

Well, for us, the issue is that our boot-loader isn't involved in
secondary cpu startup, either at boot time or during suspend/resume or cpu
hotplug/power gating.
So we certainly could have the boot loader set up the GIC for
non-secure access and then this type of solution would work, though
I'm not sure what else might need to be set up for non-secure access.

>
> --
> Catalin
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC] arm: Handle starting up in secure mode

2014-09-17 Thread Sonny Rao

On Wed, Sep 17, 2014 at 6:25 AM, Christopher Covington
 wrote:
> On 09/16/2014 05:24 PM, Christopher Covington wrote:
>> On 09/16/2014 05:09 PM, Christopher Covington wrote:
>>> ARM Linux currently has the most features available to it in hypervisor
>>> (HYP) mode, so switch to it when possible. This can also ensure proper
>>> reset of newer registers such as CNTVOFF.
>>>
>>> The permissions on the Non-Secure Access Control Register (NSACR) are
>>> used to probe what the security setting currently is when in supervisor
>>> (SVC) mode.
>>
>> Sorry, this doesn't work yet. I was misinterpreting my test results. For what
>> it's worth, my testing and development methodology is to run it after hacked
>> up versions of the semihosting bootwrapper on the simulator that corresponds
>> to rtsm_ve-aemv8a.dtb (AEM VE FVP these days?) and examine the instruction 
>> traces.
>
> Looks like the real problem was that I was hacking up the bootwrapper
> incorrectly--my start-in-secure-mode bootwrapper variant wasn't setting up the
> GIC for non-secure access. With that changed, I've tested the following
> variations using the Image file in a single core configuration.
>
> Start in non-secure SVC with non-secure access to GIC configured.
>
> Start in secure SVC with non-secure access to GIC configured.

I tried this on my rk3288 which boots in secure SVC mode, but I must
be missing the GIC configuration for non-secure access because I get a
message like this:
[0.00] GIC CPU mask not found - kernel will fail to boot.
[0.00] GIC CPU mask not found - kernel will fail to boot.

and then it hangs here:
[0.116871] CPU0: thread -1, cpu 0, socket 5, mpidr 8500
[0.123274] Setting up static identity map for 0x5e3bf8 - 0x5e3c50

So I need to figure out how to make non-secure access to the GIC work.

> Start in secure SVC with non-secure access to GIC configured and hypervisor
> support disabled in the model (-C cluster.has_el2=0). This required setting
> the VBAR again in non-secure SVC but with that fix it seems to work. I'll
> include this change in v2.
>
> I have not been able to start up the bootwrapper with secure monitor support
> disabled in the model (-C cluster.has_el3=0) because of faults during GIC
> configuration.
>
> So, any thoughts on the code? I have some questions.
>
> What initialization am I missing?
>
> What's the right ifdefery? I'm thinking a CONFIG_ARM_SEC_EXT that is
> independent of CONFIG_ARM_VIRT_EXT.
>
> Are there folks who want to run Linux in secure mode? Would documenting in
> Kconfig that they can do this by leaving CONFIG_ARM_SEC_EXT undefined be
> sufficient support for this?

We currently tend to run Linux in Secure SVC mode, and this has worked
fine for a long time, and I think this is because these chips have
reset defaults that allow us to access everything in secure mode, but
I don't know if we'd need to suddenly deal with having to configure
everything to also handle non-secure mode as well.

Our hope here is that if you know you're starting in secure svc mode
on armv7a, and have virtualization extensions, the kernel could
"elevate" itself to hyp mode, without too much work, but it looks like
on (at least) rk3288 we need to do something to the GIC at a
minimum.

>
> Is it fine or bad to leave the CPU in undefined mode when one of the extension
> or privileged operations fails? It's only for maybe a dozen instructions until
> safe_svcmode_maskall.
>
> How should CPU mode recording be handled? Should the CONFIG_ARM_SEC_EXT code
> set __boot_cpu_mode with an extra bit specifying the security setting and then
> the CONFIG_ARM_VIRT_EXT code set-if-unset?
>
> What should the secure monitor stub leave behind? Right now it's different
> than the hypervisor stub: put the address of the code you want run in monitor
> mode into r4, make the secure monitor call, and make sure your code makes an
> exception return when finished.
>
> Whatever is left behind, I think it should allow the GIC driver to enable
> non-secure access to the hardware if Linux was booted in secure mode.
>
> If this code can be evolved and tested to the point where it's mergeable, then
> Linux can finally interact intelligently with the features introduced by the
> security extensions and only has to depend on firmware and bootloaders getting
> the implementation defined aspects of a system right, taking care of the
> architectually specified stuff itself when sufficiently privileged. Among
> other things, this should lead towards the bootwrapper no longer being
> required to boot Linux on ARM's virtual platforms, which would make my life
> easier.

This would be great for us, because we tend to have our firmware do
the bare minimum possible, and this would allow people running
upstream kernels on ChromeOS systems to easily use virtualization if
they wish.

>
> Thanks,
> Christopher
>
> --
> Employee of Qualcomm Innovation Center, Inc.
> Qualcomm Innovation Center, Inc. is a member of Code Aurora Foru

Re: [PATCH v2 2/3] ARM: rockchip: add basic smp support for rk3288

2014-09-16 Thread Sonny Rao

On Tue, Sep 16, 2014 at 3:44 AM, Kever Yang  wrote:
> This patch add basic rk3288 smp support, cpu 1~3 are in wfe state
> when get into kernel.
>
> Signed-off-by: Heiko Stuebner 
> Signed-off-by: Kever Yang 
> ---
>
> Changes in v2:
> - use rk3288_boot_secondary instead ofsmp_boot_secondary
> - discards the power domain operation
> - handle the per cpu starup when actived by 'sev'
>
>  arch/arm/mach-rockchip/core.h|  1 +
>  arch/arm/mach-rockchip/headsmp.S | 14 +
>  arch/arm/mach-rockchip/platsmp.c | 63 
> 
>  3 files changed, 72 insertions(+), 6 deletions(-)
>
> diff --git a/arch/arm/mach-rockchip/core.h b/arch/arm/mach-rockchip/core.h
> index 39bca96..13de05a 100644
> --- a/arch/arm/mach-rockchip/core.h
> +++ b/arch/arm/mach-rockchip/core.h
> @@ -18,3 +18,4 @@ extern char rockchip_secondary_trampoline_end;
>
>  extern unsigned long rockchip_boot_fn;
>  extern void rockchip_secondary_startup(void);
> +extern void rk3288_secondary_startup(void);
> diff --git a/arch/arm/mach-rockchip/headsmp.S 
> b/arch/arm/mach-rockchip/headsmp.S
> index 73206e3..bacdb56 100644
> --- a/arch/arm/mach-rockchip/headsmp.S
> +++ b/arch/arm/mach-rockchip/headsmp.S
> @@ -20,6 +20,20 @@ ENTRY(rockchip_secondary_startup)
> b   secondary_startup
>  ENDPROC(rockchip_secondary_startup)
>
> +ENTRY(rk3288_secondary_startup)
> +   mrc p15, 0, r0, c0, c0, 5
> +   mov r2, #3
> +   and r0, r0, r2
> +   ldr r1, =0xff70
> +   ldr r1, [r1]
> +   cmp r0, r1
> +   beq 2f
> +   ldr r2, =0xfffd
> +   mov pc, r2
> +2:
> +   b   secondary_startup
> +ENDPROC(rk3288_secondary_startup)

Comments on what's going on here would be nice.
It looks like what you're doing is checking whether this CPU is the
one that is supposed to wake up, by looking at sram (written by the C
code below), and if this isn't the correct CPU then going back to the
boot rom?  Since we read in int-mem from the device-tree it would also
probably be good to make that ldr r1, =0xff70 reference the
sram_base_addr variable rather than hard coding the address here.

> +
>  ENTRY(rockchip_secondary_trampoline)
> ldr pc, 1f
>  ENDPROC(rockchip_secondary_trampoline)
> diff --git a/arch/arm/mach-rockchip/platsmp.c 
> b/arch/arm/mach-rockchip/platsmp.c
> index 189684f..022a01d 100644
> --- a/arch/arm/mach-rockchip/platsmp.c
> +++ b/arch/arm/mach-rockchip/platsmp.c
> @@ -60,7 +60,7 @@ static void pmu_set_power_domain(int pd, bool on)
>   * Handling of CPU cores
>   */
>
> -static int __cpuinit rockchip_boot_secondary(unsigned int cpu,
> +static int __cpuinit rk3066_boot_secondary(unsigned int cpu,
>  struct task_struct *idle)
>  {
> if (!sram_base_addr || !pmu_base_addr) {
> @@ -80,6 +80,28 @@ static int __cpuinit rockchip_boot_secondary(unsigned int 
> cpu,
> return 0;
>  }
>
> +static int __cpuinit rk3288_boot_secondary(unsigned int cpu,
> +struct task_struct *idle)
> +{
> +   if (!sram_base_addr) {
> +   pr_err("%s: sram missing for cpu boot\n", __func__);
> +   return -ENXIO;
> +   }
> +
> +   if (cpu >= ncores) {
> +   pr_err("%s: cpu %d outside maximum number of cpus %d\n",
> +   __func__, cpu, 
> ncores);
> +   return -ENXIO;
> +   }
> +   /* start the core */
> +   writel(virt_to_phys(rk3288_secondary_startup), sram_base_addr + 8);
> +   writel(0xDEADBEAF, sram_base_addr + 4);
> +   writel(cpu, sram_base_addr + 0);

I guess the boot rom is looking for 0xDEADBEAF here at sram_base_addr
+ 4, but the cpu itself will be looking at sram_base_addr + 0.
Again, might be good to somehow document what the protocol is.

> +   dsb_sev();
> +
> +   return 0;
> +}
> +
>  /**
>   * rockchip_smp_prepare_sram - populate necessary sram block
>   * Starting cores execute the code residing at the start of the on-chip sram
> @@ -125,7 +147,7 @@ static int __init rockchip_smp_prepare_sram(struct 
> device_node *node)
> return 0;
>  }
>
> -static void __init rockchip_smp_prepare_cpus(unsigned int max_cpus)
> +static void __init rk3066_smp_prepare_cpus(unsigned int max_cpus)
>  {
> struct device_node *node;
> unsigned int i;
> @@ -194,12 +216,41 @@ static void rockchip_cpu_die(unsigned int cpu)
>  }
>  #endif
>
> -static struct smp_operations rockchip_smp_ops __initdata = {
> -   .smp_prepare_cpus   = rockchip_smp_prepare_cpus,
> -   .smp_boot_secondary = rockchip_boot_secondary,
> +static void __init rk3288_smp_prepare_cpus(unsigned int max_cpus)
> +{
> +   struct device_node *node;
> +
> +   node = of_find_compatible_node(NULL, NULL, 
> "rockchip,rk3066-smp-sram");
> +   if (!node) {
> +   pr_err("%s: could not find sram dt node\n", __func__);
> +   return;
> +   }
>

Re: [PATCH v2] clocksource: arch_timer: Allow the device tree to specify the physical timer

2014-09-15 Thread Sonny Rao

On Mon, Sep 15, 2014 at 3:51 PM, Christopher Covington
 wrote:
> Hi Sonny,
>
> On 09/15/2014 06:04 PM, Sonny Rao wrote:
>> On Mon, Sep 15, 2014 at 2:52 PM, Sonny Rao  wrote:
>>> On Mon, Sep 15, 2014 at 2:49 PM, Stephen Boyd  wrote:
>>>> On 09/15/14 14:47, Sonny Rao wrote:
>>>>> On Mon, Sep 15, 2014 at 1:33 PM, Stephen Boyd  
>>>>> wrote:
>>>>>> On 09/15/14 04:10, Catalin Marinas wrote:
>>>>>>> On Fri, Sep 12, 2014 at 07:59:29PM +0100, Stephen Boyd wrote:
>>>>>>>> On 09/12/14 05:14, Marc Zyngier wrote:
>>>>>>>>> We surely can handle the UNDEF and do something there. We just can't 
>>>>>>>>> do
>>>>>>>>> it the way Doug described it above.
>>>>>>>> I suggested doing that for something else a while ago and Will and Dave
>>>>>>>> were not thrilled[1]. The suggestion back then was to use DT to
>>>>>>>> indicate what mode the kernel is running in.
>>>>>>>>
>>>>>>>> [1]
>>>>>>>> http://lists.infradead.org/pipermail/linux-arm-kernel/2012-June/105321.html
>>>>>>> I think the context was slightly different. As I re-read the thread, it
>>>>>>> seems that the discussion was around whether to use some SMC interface
>>>>>>> or not based on whether the kernel is running secure or non-secure. The
>>>>>>> argument made by Will was to actually specify the type of the firmware
>>>>>>> SMC interface in the DT and use it in the kernel (and probably assume
>>>>>>> the kernel is running in secure mode if no smc interface is specified in
>>>>>>> the DT; you could have both though, running in secure mode and also
>>>>>>> having firmware).
>>>>>>>
>>>>>>> In this arch timer case, we need to work around a firmware bug (or
>>>>>>> feature as 32-bit ARM kernels never required CNTVOFF initialisation by
>>>>>>> firmware, no matter how small such firmware is). We don't expect a
>>>>>>> specific SMC call to initialise CNTVOFF, so we can't describe it in the
>>>>>>> DT.
>>>>>> Agreed, we can't describe SMC calls that don't exist. From my
>>>>>> perspective it's just another part of the cpu boot sequence that needs
>>>>>> to be handled in the kernel, so describing the requirement via the
>>>>>> cpu-boot method seems appropriate. It seems like we're making it harder
>>>>>> than it should be by handling the undef when we could have slightly
>>>>>> different SMP boot code (and suspend/resume code) depending on the boot
>>>>>> method property.
>>>>>
>>>>> +heiko
>>>>>
>>>>> So, for the case of rk3288, based on this discussion what I'm going to
>>>>> propose is to add code to rockchip.c which looks for a particular SMP
>>>>> enable method -- say something like "rockchip,rk3288-smp-secure-svc"
>>>>> which will then assume we have been booted in secure SVC mode and do
>>>>> the CNTVOFF fixup.  I believe, it will need to do this on the boot CPU
>>>>> as well, so I think it will need to scan the DT fairly early on the
>>>>> boot CPU and also perform the function there.
>>>>>
>>>>> I'll look into implementing this and post code.  Comments and
>>>>> suggestions appreciated, thanks.
>>>>
>>>> What goes wrong if we read the cntvoff from the boot CPU during
>>>> smp_prepare_cpus() phase and use that to set the cntvoff on the other
>>>> CPUs? That avoids needing to do anything very early by making the value
>>>> the same. It does mean that cntvoff is some random out of reset value
>>>> for CPU0, but at least it's consistent.
>>>
>>> I think we cannot read the value if we're not in hyp mode.
>>
>> Well, thinking about it a little more, I think you still have a good point.
>>
>> We don't need to do this early on, as long as we haven't started using
>> the arch timers yet.  If we are still able to do this at the point
>> where we're executing the code in arch/arm/mach-rockchip/platsmp.c
>> that finds the enable method then we can just handle it there.
>
> I've been playing around with the probe-based approach and while I need to do
> a lot more testing, it seems to be working for the first tens of instructions.
> I hope to be able to share a draft of that soon. Basically, I just read the
> current NSACR value and write it back (although maybe in the long term we
> would want to make sure a few of those bits are set or cleared). If that
> succeeds, we know we're in secure SVC and can proceed to set up MON and HYP.

Christopher, sounds promising, please do share, thanks!

Marc or Will, what do you guys think about this approach?


>
> Christopher
>
> --
> Employee of Qualcomm Innovation Center, Inc.
> Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
> hosted by the Linux Foundation.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v2] clocksource: arch_timer: Allow the device tree to specify the physical timer

2014-09-15 Thread Sonny Rao

On Mon, Sep 15, 2014 at 2:52 PM, Sonny Rao  wrote:
> On Mon, Sep 15, 2014 at 2:49 PM, Stephen Boyd  wrote:
>> On 09/15/14 14:47, Sonny Rao wrote:
>>> On Mon, Sep 15, 2014 at 1:33 PM, Stephen Boyd  wrote:
>>>> On 09/15/14 04:10, Catalin Marinas wrote:
>>>>> On Fri, Sep 12, 2014 at 07:59:29PM +0100, Stephen Boyd wrote:
>>>>>> On 09/12/14 05:14, Marc Zyngier wrote:
>>>>>>> We surely can handle the UNDEF and do something there. We just can't do
>>>>>>> it the way Doug described it above.
>>>>>> I suggested doing that for something else a while ago and Will and Dave
>>>>>> were not thrilled[1]. The suggestion back then was to use DT to
>>>>>> indicate what mode the kernel is running in.
>>>>>>
>>>>>> [1]
>>>>>> http://lists.infradead.org/pipermail/linux-arm-kernel/2012-June/105321.html
>>>>> I think the context was slightly different. As I re-read the thread, it
>>>>> seems that the discussion was around whether to use some SMC interface
>>>>> or not based on whether the kernel is running secure or non-secure. The
>>>>> argument made by Will was to actually specify the type of the firmware
>>>>> SMC interface in the DT and use it in the kernel (and probably assume
>>>>> the kernel is running in secure mode if no smc interface is specified in
>>>>> the DT; you could have both though, running in secure mode and also
>>>>> having firmware).
>>>>>
>>>>> In this arch timer case, we need to work around a firmware bug (or
>>>>> feature as 32-bit ARM kernels never required CNTVOFF initialisation by
>>>>> firmware, no matter how small such firmware is). We don't expect a
>>>>> specific SMC call to initialise CNTVOFF, so we can't describe it in the
>>>>> DT.
>>>> Agreed, we can't describe SMC calls that don't exist. From my
>>>> perspective it's just another part of the cpu boot sequence that needs
>>>> to be handled in the kernel, so describing the requirement via the
>>>> cpu-boot method seems appropriate. It seems like we're making it harder
>>>> than it should be by handling the undef when we could have slightly
>>>> different SMP boot code (and suspend/resume code) depending on the boot
>>>> method property.
>>>
>>> +heiko
>>>
>>> So, for the case of rk3288, based on this discussion what I'm going to
>>> propose is to add code to rockchip.c which looks for a particular SMP
>>> enable method -- say something like "rockchip,rk3288-smp-secure-svc"
>>> which will then assume we have been booted in secure SVC mode and do
>>> the CNTVOFF fixup.  I believe, it will need to do this on the boot CPU
>>> as well, so I think it will need to scan the DT fairly early on the
>>> boot CPU and also perform the function there.
>>>
>>> I'll look into implementing this and post code.  Comments and
>>> suggestions appreciated, thanks.
>>
>> What goes wrong if we read the cntvoff from the boot CPU during
>> smp_prepare_cpus() phase and use that to set the cntvoff on the other
>> CPUs? That avoids needing to do anything very early by making the value
>> the same. It does mean that cntvoff is some random out of reset value
>> for CPU0, but at least it's consistent.
>
> I think we cannot read the value if we're not in hyp mode.

Well, thinking about it a little more, I think you still have a good point.

We don't need to do this early on, as long as we haven't started using
the arch timers yet.  If we are still able to do this at the point
where we're executing the code in arch/arm/mach-rockchip/platsmp.c
that finds the enable method then we can just handle it there.

>
>
>>
>> --
>> Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
>> hosted by The Linux Foundation
>>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v2] clocksource: arch_timer: Allow the device tree to specify the physical timer

2014-09-15 Thread Sonny Rao

On Mon, Sep 15, 2014 at 2:49 PM, Stephen Boyd  wrote:
> On 09/15/14 14:47, Sonny Rao wrote:
>> On Mon, Sep 15, 2014 at 1:33 PM, Stephen Boyd  wrote:
>>> On 09/15/14 04:10, Catalin Marinas wrote:
>>>> On Fri, Sep 12, 2014 at 07:59:29PM +0100, Stephen Boyd wrote:
>>>>> On 09/12/14 05:14, Marc Zyngier wrote:
>>>>>> We surely can handle the UNDEF and do something there. We just can't do
>>>>>> it the way Doug described it above.
>>>>> I suggested doing that for something else a while ago and Will and Dave
>>>>> were not thrilled[1]. The suggestion back then was to use DT to
>>>>> indicate what mode the kernel is running in.
>>>>>
>>>>> [1]
>>>>> http://lists.infradead.org/pipermail/linux-arm-kernel/2012-June/105321.html
>>>> I think the context was slightly different. As I re-read the thread, it
>>>> seems that the discussion was around whether to use some SMC interface
>>>> or not based on whether the kernel is running secure or non-secure. The
>>>> argument made by Will was to actually specify the type of the firmware
>>>> SMC interface in the DT and use it in the kernel (and probably assume
>>>> the kernel is running in secure mode if no smc interface is specified in
>>>> the DT; you could have both though, running in secure mode and also
>>>> having firmware).
>>>>
>>>> In this arch timer case, we need to work around a firmware bug (or
>>>> feature as 32-bit ARM kernels never required CNTVOFF initialisation by
>>>> firmware, no matter how small such firmware is). We don't expect a
>>>> specific SMC call to initialise CNTVOFF, so we can't describe it in the
>>>> DT.
>>> Agreed, we can't describe SMC calls that don't exist. From my
>>> perspective it's just another part of the cpu boot sequence that needs
>>> to be handled in the kernel, so describing the requirement via the
>>> cpu-boot method seems appropriate. It seems like we're making it harder
>>> than it should be by handling the undef when we could have slightly
>>> different SMP boot code (and suspend/resume code) depending on the boot
>>> method property.
>>
>> +heiko
>>
>> So, for the case of rk3288, based on this discussion what I'm going to
>> propose is to add code to rockchip.c which looks for a particular SMP
>> enable method -- say something like "rockchip,rk3288-smp-secure-svc"
>> which will then assume we have been booted in secure SVC mode and do
>> the CNTVOFF fixup.  I believe, it will need to do this on the boot CPU
>> as well, so I think it will need to scan the DT fairly early on the
>> boot CPU and also perform the function there.
>>
>> I'll look into implementing this and post code.  Comments and
>> suggestions appreciated, thanks.
>
> What goes wrong if we read the cntvoff from the boot CPU during
> smp_prepare_cpus() phase and use that to set the cntvoff on the other
> CPUs? That avoids needing to do anything very early by making the value
> the same. It does mean that cntvoff is some random out of reset value
> for CPU0, but at least it's consistent.

I think we cannot read the value if we're not in hyp mode.


>
> --
> Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
> hosted by The Linux Foundation
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v2] clocksource: arch_timer: Allow the device tree to specify the physical timer

2014-09-15 Thread Sonny Rao

On Mon, Sep 15, 2014 at 1:33 PM, Stephen Boyd  wrote:
>
> On 09/15/14 04:10, Catalin Marinas wrote:
> > On Fri, Sep 12, 2014 at 07:59:29PM +0100, Stephen Boyd wrote:
> >> On 09/12/14 05:14, Marc Zyngier wrote:
> >>> We surely can handle the UNDEF and do something there. We just can't do
> >>> it the way Doug described it above.
> >> I suggested doing that for something else a while ago and Will and Dave
> >> were not thrilled[1]. The suggestion back then was to use DT to
> >> indicate what mode the kernel is running in.
> >>
> >> [1]
> >> http://lists.infradead.org/pipermail/linux-arm-kernel/2012-June/105321.html
> > I think the context was slightly different. As I re-read the thread, it
> > seems that the discussion was around whether to use some SMC interface
> > or not based on whether the kernel is running secure or non-secure. The
> > argument made by Will was to actually specify the type of the firmware
> > SMC interface in the DT and use it in the kernel (and probably assume
> > the kernel is running in secure mode if no smc interface is specified in
> > the DT; you could have both though, running in secure mode and also
> > having firmware).
> >
> > In this arch timer case, we need to work around a firmware bug (or
> > feature as 32-bit ARM kernels never required CNTVOFF initialisation by
> > firmware, no matter how small such firmware is). We don't expect a
> > specific SMC call to initialise CNTVOFF, so we can't describe it in the
> > DT.
>
> Agreed, we can't describe SMC calls that don't exist. From my
> perspective it's just another part of the cpu boot sequence that needs
> to be handled in the kernel, so describing the requirement via the
> cpu-boot method seems appropriate. It seems like we're making it harder
> than it should be by handling the undef when we could have slightly
> different SMP boot code (and suspend/resume code) depending on the boot
> method property.


+heiko

So, for the case of rk3288, based on this discussion what I'm going to
propose is to add code to rockchip.c which looks for a particular SMP
enable method -- say something like "rockchip,rk3288-smp-secure-svc"
which will then assume we have been booted in secure SVC mode and do
the CNTVOFF fixup.  I believe, it will need to do this on the boot CPU
as well, so I think it will need to scan the DT fairly early on the
boot CPU and also perform the function there.

I'll look into implementing this and post code.  Comments and
suggestions appreciated, thanks.


>
>
> --
> Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
> hosted by The Linux Foundation
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v2] clocksource: arch_timer: Allow the device tree to specify the physical timer

2014-09-11 Thread Sonny Rao

On Thu, Sep 11, 2014 at 6:17 PM, Stephen Boyd  wrote:
> On 09/11/14 17:14, Sonny Rao wrote:
>
> On Thu, Sep 11, 2014 at 4:56 PM, Stephen Boyd  wrote:
>>
>>
>> Where does this platform jump to when a CPU comes up? Is it
>> rockchip_secondary_startup()? I wonder if that path could have this
>> little bit of assembly to poke the cntvoff in monitor mode and then jump
>> to secondary_startup()? Before we boot any secondary CPUs we could also
>> read the cntvoff for CPU0 in the platform specific layer (where we know
>> we're running in secure mode) and then use that value as the "reset"
>> value for the secondaries. Or does this platform boot up in secure mode
>> some times and non-secure mode other times?
>
>
> Yes, In our case, with our firmware, we will go through some internal Rom
> code and then jump to rockchip_secondary_startup, but I don't think it's
> correct to force all users of this SoC to do it that way.
>
>
> What's being forced? The way internal rom jumps to sram? Is there any other
> way that secondary CPUs come out of reset on this SoC? From looking at the
> code it seems like the only path is internal rom jumps to sram (where
> rockchip_secondary_trampoline lives) which jumps to
> rockchip_secondary_startup() which then does an invalidate and jump to
> secondary_startup(). Linux controls everything besides the internal rom. Is
> something different in your case?


There are other ways it can be done, and I don't know all of the
possibilities, but there seems to be some protocol with the iROM that
tells it where to go, which the current SMP patches are using by
putting a magic number and an address in SRAM.  I think it's true that
in our case, it really is pretty simple and we have secure SVC mode
and not much else runs (besides the iROM).

Since I don't know all of the possibilities, I didn't want to preclude
the possibility that someone else handled things differently and
entered the kernel in non-secure mode, and have some code there that
broke in that instance, that's all I meant by "forced".

>
>  If there were a reasonable way to determine for sure that we are in secure
> mode, then yes we could do what you're suggesting, and I'd be happy to code
> that up.
>
>
> I think the problem is that there isn't a great way to determine whether
> we're in secure mode or not, and this is maybe by design?  I don't
> particularly understand that design choice.  It would be nice to hear some
> rationale from ARM folks.
>
>
> I'm thinking we would have a different boot-method for secure vs. non-secure
> and then we would know to configure cntvoff or not based on the boot method.
> Isn't that a reasonable way of knowing what should be done? It seems like we
> can at least modify the DT for this SoC.

Putting something into the device-tree is in fact the point of this
patch, so it is sort of doing what you're suggesting, although this
patch is about being able to use physical counters and doesn't
indicate anything about secure vs non-secure.  What else do you think
could be used to differentiate between the two cases, besides putting
it into the DT?

>
> I still wonder if there is such a bootloader/hypervisor/rom that's putting
> this SoC into non-secure mode and not configuring cntvoff. Doug's comments
> seem to suggest that the whole world would be different if this were true.
> Maybe Heiko knows?

As far as I'm aware, there's no bootloader/firmware that's ever
putting the CPU into non-secure mode for our case.

> --
> Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
> hosted by The Linux Foundation
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v2] clocksource: arch_timer: Fix code to use physical timers when requested

2014-09-11 Thread Sonny Rao

This is a bug fix for using physical arch timers when
the arch_timer_use_virtual boolean is false.  It restores the
arch_counter_get_cntpct() function after removal in

0d651e4e "clocksource: arch_timer: use virtual counters"

and completes the implementation of memory mapped access for physical
timers, so if a system is trying to use physical timers, it will
function properly.

We need this on certain ARMv7 systems which are architected like this:

* The firmware doesn't know and doesn't care about hypervisor mode and
  we don't want to add the complexity of hypervisor there.

* The firmware isn't involved in SMP bringup or resume.

* The ARCH timers come up with an uninitialized offset between the
  virtual and physical counters.  Each core gets a different random
  offset.

* The device boots in "Secure SVC" mode.

* Nothing has touched the reset value of CNTHCTL.PL1PCEN or
  CNTHCTL.PL1PCTEN (both default to 1 at reset)

One example of such a system is RK3288 where it is much simpler to
use the physical counter since there's nobody managing the offset and
each time a core goes down and comes back up the offset will get
reinitialized to some other random value.

Fixes: 0d651e4e65e9 ("clocksource: arch_timer: use virtual counters")
Cc: sta...@vger.kernel.org
Signed-off-by: Sonny Rao 
Acked-by: Olof Johansson 
---
v2: Add fixes tag to commit message, cc stable, copy Doug's
description of the systems which need this in commit message.
---
 arch/arm/include/asm/arch_timer.h|  9 +
 arch/arm64/include/asm/arch_timer.h  | 10 ++
 drivers/clocksource/arm_arch_timer.c | 30 ++
 3 files changed, 45 insertions(+), 4 deletions(-)

diff --git a/arch/arm/include/asm/arch_timer.h 
b/arch/arm/include/asm/arch_timer.h
index 0704e0c..e72aa4d 100644
--- a/arch/arm/include/asm/arch_timer.h
+++ b/arch/arm/include/asm/arch_timer.h
@@ -78,6 +78,15 @@ static inline u32 arch_timer_get_cntfrq(void)
return val;
 }
 
+static inline u64 arch_counter_get_cntpct(void)
+{
+   u64 cval;
+
+   isb();
+   asm volatile("mrrc p15, 0, %Q0, %R0, c14" : "=r" (cval));
+   return cval;
+}
+
 static inline u64 arch_counter_get_cntvct(void)
 {
u64 cval;
diff --git a/arch/arm64/include/asm/arch_timer.h 
b/arch/arm64/include/asm/arch_timer.h
index 9400596..58657c4 100644
--- a/arch/arm64/include/asm/arch_timer.h
+++ b/arch/arm64/include/asm/arch_timer.h
@@ -135,6 +135,16 @@ static inline void arch_timer_evtstrm_enable(int divider)
 #endif
 }
 
+static inline u64 arch_counter_get_cntpct(void)
+{
+   u64 cval;
+
+   isb();
+   asm volatile("mrs %0, cntpct_el0" : "=r" (cval));
+
+   return cval;
+}
+
 static inline u64 arch_counter_get_cntvct(void)
 {
u64 cval;
diff --git a/drivers/clocksource/arm_arch_timer.c 
b/drivers/clocksource/arm_arch_timer.c
index 5163ec1..ad723cb 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -30,6 +30,8 @@
 #define CNTTIDR0x08
 #define CNTTIDR_VIRT(n)(BIT(1) << ((n) * 4))
 
+#define CNTPCT_LO  0x00
+#define CNTPCT_HI  0x04
 #define CNTVCT_LO  0x08
 #define CNTVCT_HI  0x0c
 #define CNTFRQ 0x10
@@ -386,6 +388,19 @@ static u64 arch_counter_get_cntvct_mem(void)
return ((u64) vct_hi << 32) | vct_lo;
 }
 
+static u64 arch_counter_get_cntpct_mem(void)
+{
+   u32 pct_lo, pct_hi, tmp_hi;
+
+   do {
+   pct_hi = readl_relaxed(arch_counter_base + CNTPCT_HI);
+   pct_lo = readl_relaxed(arch_counter_base + CNTPCT_LO);
+   tmp_hi = readl_relaxed(arch_counter_base + CNTPCT_HI);
+   } while (pct_hi != tmp_hi);
+
+   return ((u64) pct_hi << 32) | pct_lo;
+}
+
 /*
  * Default to cp15 based access because arm64 uses this function for
  * sched_clock() before DT is probed and the cp15 method is guaranteed
@@ -429,10 +444,17 @@ static void __init arch_counter_register(unsigned type)
u64 start_count;
 
/* Register the CP15 based counter if we have one */
-   if (type & ARCH_CP15_TIMER)
-   arch_timer_read_counter = arch_counter_get_cntvct;
-   else
-   arch_timer_read_counter = arch_counter_get_cntvct_mem;
+   if (type & ARCH_CP15_TIMER) {
+   if (arch_timer_use_virtual)
+   arch_timer_read_counter = arch_counter_get_cntvct;
+   else
+   arch_timer_read_counter = arch_counter_get_cntpct;
+   } else {
+   if (arch_timer_use_virtual)
+   arch_timer_read_counter = arch_counter_get_cntvct_mem;
+   else
+   arch_timer_read_counter = arch_counter_get_cntpct_mem;
+   }
 
start_count = arch_timer_read_counter();
clocksource_register_hz(&clocksource_counter, arch_timer

Re: [PATCH] clocksource: arch_timer: Fix code to use physical timers when requested

2014-09-10 Thread Sonny Rao

On Wed, Sep 10, 2014 at 10:52 AM, Doug Anderson  wrote:
> Mark,
>
> On Wed, Sep 10, 2014 at 10:27 AM, Mark Rutland  wrote:
>> Hi Sonny,
>>
>> On Wed, Aug 27, 2014 at 10:03:39PM +0100, Sonny Rao wrote:
>>> This is a bug fix for using physical arch timers when
>>> the arch_timer_use_virtual boolean is false.  It restores the
>>> arch_counter_get_cntpct() function after removal in
>>>
>>> 0d651e4e "clocksource: arch_timer: use virtual counters"
>>>
>>> and completes the implementation of memory mapped access for physical
>>> timers, so if a system is trying to use physical timers, it will
>>> function properly.
>>
>> To get back to the topic at hand:
>>
>> Which platform is this required by?
>
> I've seen similar problems on the A7s on exynos5420 / exynos5800 and
> on rk3288.  I can't say what other platforms might be affected.  Note
> that the arch timers on exynos5420/exynos5800 are not supported anyway,
> so I guess that means we're just worrying about the rk3288.

Yeah, given that this problem has manifested on at least two different
SoCs using ARM's cores, it would have been nice if the architecture had
specified that the offset starts out as zero at reset (and ARM's core IP
had been implemented that way), but it looks like it wasn't.

>
>
>> Why exactly is arch_timer_use_virtual false in this case?
>
> To re-summarize my understanding of everything (much of the below is
> secondhand knowledge, so correct if wrong):
>
> 1. The initial problem is that the virtual offset is not initialized
> by anyone and is per core (each core gets a different, random offset).
> That makes the virtual counter useless.  ...but the kernel only uses
> virtual counters.
>
> 2. As far as I know, we don't have any particular need for HYP mode
> nor for limiting access to secure mode.
>
> 3. You can only change the virtual offset from HYP mode.  That means
> someone needs to transition to HYP mode if we want to use virtual
> counters.
>
> 4. If the kernel happens to be in HYP mode it will init the virtual offset.
>
> 5. We could transition to HYP mode once in the firmware and boot the
> kernel like that, but since firmware is gone after we've booted the
> kernel, we run into the same problem when we power off a processor and
> when we resume from S3.  The firmware is not involved in these cases.
> In these cases the processors have an uninitialized virtual offset
> again.  These processors don't appear to magically come up in HYP
> mode.  Thus it would be up to the kernel to transition to HYP mode,
> init the offset, and get out of HYP mode.  ...or just use the physical
> counter.
>
>
> If you can suggest something that doesn't require us to involve the
> firmware in processor bringup and in resume, I'm all ears.  We have a
> desire not to involve the firmware there because of all the issues of
> keeping kernel/firmware in sync and because of the extra difficulty
> of shipping firmware updates (understandably the QA needed to validate
> a new firmware is much higher than the QA needed to validate a new
> kernel).

One thing that was used in the past was to have the kernel load a blob
from /lib/firmware/ which did some re-initialization when coming out
of suspend or deep sleep.  We could do something similar here and have
it either fix virtual offset or put it into hyp mode.  That would help
solve our issue of making it easier to update and avoid QA hassles.
Is such a solution acceptable to upstream?


>
>
> -Doug
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] clocksource: arch_timer: Fix code to use physical timers when requested

2014-09-04 Thread Sonny Rao

On Fri, Aug 29, 2014 at 3:04 AM, Mark Rutland  wrote:
> On Fri, Aug 29, 2014 at 01:10:49AM +0100, Sonny Rao wrote:
>> On Thu, Aug 28, 2014 at 2:35 AM, Mark Rutland  wrote:
>> > On Thu, Aug 28, 2014 at 04:33:31AM +0100, Doug Anderson wrote:
>> >> Hi,
>> >>
>> >> On Wed, Aug 27, 2014 at 7:58 PM, Olof Johansson  wrote:
>> >> > On Wed, Aug 27, 2014 at 5:56 PM, Stephen Boyd  
>> >> > wrote:
>> >> >> On 08/27/14 15:33, Olof Johansson wrote:
>> >> >>> On Wed, Aug 27, 2014 at 3:26 PM, Stephen Boyd  
>> >> >>> wrote:
>> >> >>>
>> >> >>>> Is there any reason why the virtual counter can't be read? Maybe 
>> >> >>>> we're
>> >> >>>> the hyp and we need to make sure we don't use the virtual timer so 
>> >> >>>> that
>> >> >>>> the guest can use it, but that doesn't have any effect on the usage 
>> >> >>>> of
>> >> >>>> the virtual counter for the clocksource.
>> >> >>> There are several cases where virtual is unusable -- in particular it
>> >> >>> might not have been configured properly (i.e. the phys/virt offset is
>> >> >>> at a bad value).
>> >> >>>
>> >> >>
>> >> >> Any specifics? It would be nice to say so in the commit text so that
>> >> >> others using such devices know they need this patch. I'm guessing the
>> >> >> firmware can't be fixed?
>> >>
>> >> Even if we could change things to use a virtual timer in some cases,
>> >> Sonny's patch still fixes a bug.  The code as written right now makes
>> >> pretenses about supporting the physical timer, but it doesn't work.
>> >> That should be fixed.
>> >
>> > The code does support the physical timer. It does not support the
>> > physical counter (and makes no pretenses that it does).
>> >
>>
>> Is there some reason that it should not support it?  It seems like the
>> two things are highly related.
>
> While the two are related, in sane systems the use of the physical
> counters is rendered unnecessary by the ability to write to CNTVOFF at
> PL2.

By sane, do you mean a system which starts the kernel in PL2?  Or one
that has CNTVOFF initialized to the same value on all CPUs?

>
> If an OS is booted at PL1 the physical timers aren't guaranteed to be
> accessible, so the OS must use the virtual timers. As the OS could be
> virtualized it must use the virtual counters.

I was curious to learn more about these modes and looked at the spec.
The spec I have seems to say that in a VMSA implementation without
virtualization, CNTPCT is always available, but if it has
virtualization then a bit needs to be set, which I think is what
you're referring to.  I think the spec also says that virtualization
extensions are optional.  How do you deal with the case that they are
not implemented?  Or maybe that simply isn't supported?

> If an OS is booted at PL2 it can access the physical counters, and
> should do so in case something like KVM will be used later. The OS can
> write to CNTVOFF at PL2, and if it sets CNTVOFF to zero the physical and
> virtual counters are equivalent. Thus it can use the virtual counters
> and doesn't need to have additional code in several places (including
> the VDSO) where it needs to choose which counters to read.
>
> The problem only exists where PL2 exists and the firmware/bootloader
> skipped PL2 without initialising the necessary PL2 state. This is in
> general a stupid thing to do; it introduces a problem that need not
> exist and throws away the option of using the features PL2 provides.
> This is a firmware/bootloader bug.

Well it's not quite that simple, this is actually an issue with the
hardware that the CNTVOFF comes up with different values on different
cores.  This happens not only at boot, but any time the core is
powered on, which could include deep sleep or CPU hotplug and suspend
to ram.  The firmware may not be involved in all these cases, so we
cannot rely on it to fix this problem.

>
>> > I had hoped we wouldn't encounter cases where CNTVOFF was hopelessly
>> > ill-configured on a platform, but evidently we have. So we need some
>> > workaround for that.
>> >
>> >> > Yeah, there are a few. The big.LITTLE on the Chromebook 2 models have
>> >> > this issue, due to the A7 cluster having an incorrect offset
>> >> > prog

Re: [PATCH] clocksource: arch_timer: Fix code to use physical timers when requested

2014-08-28 Thread Sonny Rao

On Thu, Aug 28, 2014 at 2:35 AM, Mark Rutland  wrote:
> On Thu, Aug 28, 2014 at 04:33:31AM +0100, Doug Anderson wrote:
>> Hi,
>>
>> On Wed, Aug 27, 2014 at 7:58 PM, Olof Johansson  wrote:
>> > On Wed, Aug 27, 2014 at 5:56 PM, Stephen Boyd  wrote:
>> >> On 08/27/14 15:33, Olof Johansson wrote:
>> >>> On Wed, Aug 27, 2014 at 3:26 PM, Stephen Boyd  
>> >>> wrote:
>> >>>
>> >>>> Is there any reason why the virtual counter can't be read? Maybe we're
>> >>>> the hyp and we need to make sure we don't use the virtual timer so that
>> >>>> the guest can use it, but that doesn't have any effect on the usage of
>> >>>> the virtual counter for the clocksource.
>> >>> There are several cases where virtual is unusable -- in particular it
>> >>> might not have been configured properly (i.e. the phys/virt offset is
>> >>> at a bad value).
>> >>>
>> >>
>> >> Any specifics? It would be nice to say so in the commit text so that
>> >> others using such devices know they need this patch. I'm guessing the
>> >> firmware can't be fixed?
>>
>> Even if we could change things to use a virtual timer in some cases,
>> Sonny's patch still fixes a bug.  The code as written right now makes
>> pretenses about supporting the physical timer, but it doesn't work.
>> That should be fixed.
>
> The code does support the physical timer. It does not support the
> physical counter (and makes no pretenses that it does).
>

Is there some reason that it should not support it?  It seems like the
two things are highly related.

> I had hoped we wouldn't encounter cases where CNTVOFF was hopelessly
> ill-configured on a platform, but evidently we have. So we need some
> workaround for that.
>
>> > Yeah, there are a few. The big.LITTLE on the Chromebook 2 models have
>> > this issue, due to the A7 cluster having an incorrect offset
>> > programmed. However, arch timers aren't supported on that SoC in the
>> > first place, so it's not a problem in reality.
>> >
>> > The other known platform is rk3288. It has products out in the wild
>> > where firmware updates are unlikely.
>>
>> One other reason is that (I'm told) the virtual offset is lost in
>> certain power down conditions (powering down a core, going into S3,
>> etc).  When we power back up the offset is effectively reset to a
>> random value.  That means we need something to reprogram the virtual
>> timer offset whenever we power things back up.
>>
>> If we've got a hypervisor then the hypervisor will definitely be
>> involved in powering things back up and it can reset the virtual
>> offset.  ...but forcing systems to implement a hypervisor (or somehow
>> adding an interface for the kernel to call back into firmware) is a
>> huge effort and it means more hard-to-update code sitting in firmware.
>
> Not if you boot Linux at hyp, as we've recommended for this precise
> reason. That doesn't fix other things like CNTFRQ if the secure
> initialisation doesn't poke that, however.

That's interesting, we could look into that.

> Thanks,
> Mark.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] clocksource: arch_timer: Fix code to use physical timers when requested

2014-08-27 Thread Sonny Rao

On Wed, Aug 27, 2014 at 2:19 PM, Olof Johansson  wrote:
> On Wed, Aug 27, 2014 at 2:03 PM, Sonny Rao  wrote:
>> This is a bug fix for using physical arch timers when
>> the arch_timer_use_virtual boolean is false.  It restores the
>> arch_counter_get_cntpct() function after removal in
>>
>> 0d651e4e "clocksource: arch_timer: use virtual counters"
>>
>> and completes the implementation of memory mapped access for physical
>> timers, so if a system is trying to use physical timers, it will
>> function properly.
>>
>> Signed-off-by: Sonny Rao 
>
> Acked-by: Olof Johansson 
>
> This should have a:
>
> Fixes: 0d651e4e65e9 ("clocksource: arch_timer: use virtual counters")
>
> tag too, and possibly cc stable?

Ok, as far as stable goes, this patch wouldn't apply cleanly going all
the way back to  0d651e4e65e9
As-is, it would need to go after 220069945b29 "clocksource:
arch_timer: Add support for memory mapped timers" and there would need
to be another, simpler, version that went between those two commits.

So, I'm not sure what to do in this situation regarding stable?

>
>
> -Olof
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] clocksource: arch_timer: Fix code to use physical timers when requested

2014-08-27 Thread Sonny Rao

This is a bug fix for using physical arch timers when
the arch_timer_use_virtual boolean is false.  It restores the
arch_counter_get_cntpct() function after removal in

0d651e4e "clocksource: arch_timer: use virtual counters"

and completes the implementation of memory mapped access for physical
timers, so if a system is trying to use physical timers, it will
function properly.

Signed-off-by: Sonny Rao 
---
 arch/arm/include/asm/arch_timer.h|  9 +
 arch/arm64/include/asm/arch_timer.h  | 10 ++
 drivers/clocksource/arm_arch_timer.c | 30 ++
 3 files changed, 45 insertions(+), 4 deletions(-)

diff --git a/arch/arm/include/asm/arch_timer.h 
b/arch/arm/include/asm/arch_timer.h
index 0704e0c..e72aa4d 100644
--- a/arch/arm/include/asm/arch_timer.h
+++ b/arch/arm/include/asm/arch_timer.h
@@ -78,6 +78,15 @@ static inline u32 arch_timer_get_cntfrq(void)
return val;
 }
 
+static inline u64 arch_counter_get_cntpct(void)
+{
+   u64 cval;
+
+   isb();
+   asm volatile("mrrc p15, 0, %Q0, %R0, c14" : "=r" (cval));
+   return cval;
+}
+
 static inline u64 arch_counter_get_cntvct(void)
 {
u64 cval;
diff --git a/arch/arm64/include/asm/arch_timer.h 
b/arch/arm64/include/asm/arch_timer.h
index 9400596..58657c4 100644
--- a/arch/arm64/include/asm/arch_timer.h
+++ b/arch/arm64/include/asm/arch_timer.h
@@ -135,6 +135,16 @@ static inline void arch_timer_evtstrm_enable(int divider)
 #endif
 }
 
+static inline u64 arch_counter_get_cntpct(void)
+{
+   u64 cval;
+
+   isb();
+   asm volatile("mrs %0, cntpct_el0" : "=r" (cval));
+
+   return cval;
+}
+
 static inline u64 arch_counter_get_cntvct(void)
 {
u64 cval;
diff --git a/drivers/clocksource/arm_arch_timer.c 
b/drivers/clocksource/arm_arch_timer.c
index 5163ec1..ad723cb 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -30,6 +30,8 @@
 #define CNTTIDR0x08
 #define CNTTIDR_VIRT(n)(BIT(1) << ((n) * 4))
 
+#define CNTPCT_LO  0x00
+#define CNTPCT_HI  0x04
 #define CNTVCT_LO  0x08
 #define CNTVCT_HI  0x0c
 #define CNTFRQ 0x10
@@ -386,6 +388,19 @@ static u64 arch_counter_get_cntvct_mem(void)
return ((u64) vct_hi << 32) | vct_lo;
 }
 
+static u64 arch_counter_get_cntpct_mem(void)
+{
+   u32 pct_lo, pct_hi, tmp_hi;
+
+   do {
+   pct_hi = readl_relaxed(arch_counter_base + CNTPCT_HI);
+   pct_lo = readl_relaxed(arch_counter_base + CNTPCT_LO);
+   tmp_hi = readl_relaxed(arch_counter_base + CNTPCT_HI);
+   } while (pct_hi != tmp_hi);
+
+   return ((u64) pct_hi << 32) | pct_lo;
+}
+
 /*
  * Default to cp15 based access because arm64 uses this function for
  * sched_clock() before DT is probed and the cp15 method is guaranteed
@@ -429,10 +444,17 @@ static void __init arch_counter_register(unsigned type)
u64 start_count;
 
/* Register the CP15 based counter if we have one */
-   if (type & ARCH_CP15_TIMER)
-   arch_timer_read_counter = arch_counter_get_cntvct;
-   else
-   arch_timer_read_counter = arch_counter_get_cntvct_mem;
+   if (type & ARCH_CP15_TIMER) {
+   if (arch_timer_use_virtual)
+   arch_timer_read_counter = arch_counter_get_cntvct;
+   else
+   arch_timer_read_counter = arch_counter_get_cntpct;
+   } else {
+   if (arch_timer_use_virtual)
+   arch_timer_read_counter = arch_counter_get_cntvct_mem;
+   else
+   arch_timer_read_counter = arch_counter_get_cntpct_mem;
+   }
 
start_count = arch_timer_read_counter();
clocksource_register_hz(&clocksource_counter, arch_timer_rate);
-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 1/4] ARM: rockchip: rk3288: Switch to use the proper PWM IP

2014-08-18 Thread Sonny Rao

On Mon, Aug 18, 2014 at 10:09 AM, Doug Anderson  wrote:
> The rk3288 SoC has an option to switch all of the PWMs in the system
> between the old IP block and the new IP block.  The new IP block is
> working and tested and the suggested PWM to use, so setup the SoC to
> use it and then we can pretend that the other IP block doesn't exist.
>
> This code could go lots of other places, but we've put it here.  Why?
> - Pushing it to the bootloader just makes the code harder to update in
>   the field.  If we later find a bug in the new IP block and want to
>   change our mind about what to use we want it to be easy to update.
> - Putting this code in the driver for IP block is a lot of extra work,
>   device tree bindings, etc.  Now that the new IP block is validated
>   it's likely no future SoCs will need this code.  Why pollute the PWM
>   driver with this?  This is an rk3288 thing so it should be in rk3288
>   code.
> - There's a single bit that switches over PWMs, which makes it extra
>   hard to put this under the PWM device tree nodes.
>
> Signed-off-by: Doug Anderson 
> ---
>  arch/arm/mach-rockchip/rockchip.c | 19 +++
>  1 file changed, 19 insertions(+)
>
> diff --git a/arch/arm/mach-rockchip/rockchip.c 
> b/arch/arm/mach-rockchip/rockchip.c
> index 8ab9e0e..99133b9 100644
> --- a/arch/arm/mach-rockchip/rockchip.c
> +++ b/arch/arm/mach-rockchip/rockchip.c
> @@ -24,6 +24,24 @@
>  #include 
>  #include "core.h"
>
> +static void __init rk3288_init_machine(void)
> +{
> +   void *grf = ioremap(0xff77, 0x1);

Is it worth checking for failure here?  Will the system boot without this?

> +
> +   /* Set pwm_sel to RK design PWM in GRF_SOC_CON2; affects all PWMs */
> +   writel(0x00010001, grf + 0x24c);
> +
> +   iounmap(grf);
> +}
> +
> +static void __init rockchip_init_machine(void)
> +{
> +   if (of_machine_is_compatible("rockchip,rk3288"))
> +   rk3288_init_machine();
> +
> +   of_platform_populate(NULL, of_default_bus_match_table, NULL, NULL);
> +}
> +
>  static const char * const rockchip_board_dt_compat[] = {
> "rockchip,rk2928",
> "rockchip,rk3066a",
> @@ -34,6 +52,7 @@ static const char * const rockchip_board_dt_compat[] = {
>  };
>
>  DT_MACHINE_START(ROCKCHIP_DT, "Rockchip Cortex-A9 (Device Tree)")
> +   .init_machine   = rockchip_init_machine,
> .l2c_aux_val= 0,
> .l2c_aux_mask   = ~0,
> .dt_compat  = rockchip_board_dt_compat,
> --
> 2.1.0.rc2.206.gedb03e5
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v3 1/4] ARM: dts: Add emmc and sdmmc to the rk3288 device tree

2014-08-12 Thread Sonny Rao

On Tue, Aug 12, 2014 at 3:07 PM, Heiko Stübner  wrote:
> Am Dienstag, 12. August 2014, 14:06:11 schrieb Doug Anderson:
>> Heiko,
>>
>> On Wed, Aug 6, 2014 at 10:09 AM, Doug Anderson 
> wrote:
>> > This adds support for the sdmmc and emmc ports on the rk3288 using the
>> >
>> > currently posted driver from Addy at:
>> >   https://patchwork.kernel.org/patch/4653631/
>> >
>> > Note:
>> > * This is not based on Jaehoon's patch series removing the slot node,
>> >
>> >   but it does use new syntax like putting the bus width at the top
>> >   level and using the new cap-mmc-highspeed / cap-sd-highspeed.  A
>> >   future patch will modify this one to remove the slot node.
>> >
>> > Signed-off-by: Doug Anderson 
>> > Acked-by: Arnd Bergmann 
>> > ---
>> > Changes in v3: None
>> > Changes in v2:
>> > - New patchwork link for Addy's patch
>> >
>> >  arch/arm/boot/dts/rk3288.dtsi | 22 ++
>> >  1 file changed, 22 insertions(+)
>> >
>> > diff --git a/arch/arm/boot/dts/rk3288.dtsi b/arch/arm/boot/dts/rk3288.dtsi
>> > index e7cb008..dc98a5b 100644
>> > --- a/arch/arm/boot/dts/rk3288.dtsi
>> > +++ b/arch/arm/boot/dts/rk3288.dtsi
>> > @@ -78,6 +78,28 @@
>> >
>> > clock-frequency = <2400>;
>> >
>> > };
>> >
>> > +   sdmmc: dwmmc@ff0c {
>> > +   compatible = "rockchip,rk3288-dw-mshc";
>> > +   clocks = <&cru HCLK_SDMMC>, <&cru SCLK_SDMMC>;
>> > +   clock-names = "biu", "ciu";
>> > +   fifo-depth = <0x100>;
>> > +   interrupts = ;
>> > +   reg = <0xff0c 0x4000>;
>> > +   #address-cells = <1>;
>> > +   #size-cells = <0>;
>>
>> When doing other testing I realized that I missed a:
>>   status = "disabled";
>>
>> ...from both of these two nodes.  I'm happy to repost with this fix or
>> I'm happy if you want to add to the patch when applying.
>>
>> Let me know.  Thanks!
>
> I don't really have a preference :-) . Btw. I also did plan on merging patches
> 1 +4 and 2+3 now that the slot-removal series has landed.
> Would this be ok with you?
>
> So if you want to repost, you could do this as two patches already :-).

Please repost with the status = "disabled"; the fact that they were
enabled by the dtsi was confusing to some of us at least once already
:-)

>
>
> Heiko
>
>>
>> > +   };
>> > +
>> > +   emmc: dwmmc@ff0f {
>> > +   compatible = "rockchip,rk3288-dw-mshc";
>> > +   clocks = <&cru HCLK_EMMC>, <&cru SCLK_EMMC>;
>> > +   clock-names = "biu", "ciu";
>> > +   fifo-depth = <0x100>;
>> > +   interrupts = ;
>> > +   reg = <0xff0f 0x4000>;
>> > +   #address-cells = <1>;
>> > +   #size-cells = <0>;
>> > +   };
>> > +
>> >
>> > i2c1: i2c@ff14 {
>> >
>> > compatible = "rockchip,rk3288-i2c";
>> > reg = <0xff14 0x1000>;
>> >
>> > --
>> > 2.0.0.526.g5318336
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] mmc: dw_mmc: change to use recommended reset procedure

2014-08-04 Thread Sonny Rao

This patch changes the fifo reset code to follow the reset procedure
outlined in the documentation of Synopsys Mobile storage host databook.

Signed-off-by: Sonny Rao 
Signed-off-by: Yuvaraj Kumar C D 
Acked-by: Seungwon Jeon 
Signed-off-by: Ulf Hansson 
[sonnyrao: fix compile for !CONFIG_MMC_DW_IDMAC case]
---
 drivers/mmc/host/dw_mmc.c | 87 ++-
 drivers/mmc/host/dw_mmc.h |  5 +++
 2 files changed, 69 insertions(+), 23 deletions(-)

diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c
index 1ac227c..39cf54f 100644
--- a/drivers/mmc/host/dw_mmc.c
+++ b/drivers/mmc/host/dw_mmc.c
@@ -111,8 +111,7 @@ static const u8 tuning_blk_pattern_8bit[] = {
0xff, 0x77, 0x77, 0xff, 0x77, 0xbb, 0xdd, 0xee,
 };
 
-static inline bool dw_mci_fifo_reset(struct dw_mci *host);
-static inline bool dw_mci_ctrl_all_reset(struct dw_mci *host);
+static bool dw_mci_reset(struct dw_mci *host);
 
 #if defined(CONFIG_DEBUG_FS)
 static int dw_mci_req_show(struct seq_file *s, void *v)
@@ -1235,7 +1234,7 @@ static int dw_mci_data_complete(struct dw_mci *host, 
struct mmc_data *data)
 * After an error, there may be data lingering
 * in the FIFO
 */
-   dw_mci_fifo_reset(host);
+   dw_mci_reset(host);
} else {
data->bytes_xfered = data->blocks * data->blksz;
data->error = 0;
@@ -1352,7 +1351,7 @@ static void dw_mci_tasklet_func(unsigned long priv)
 
/* CMD error in data command */
if (mrq->cmd->error && mrq->data)
-   dw_mci_fifo_reset(host);
+   dw_mci_reset(host);
 
host->cmd = NULL;
host->data = NULL;
@@ -1963,14 +1962,8 @@ static void dw_mci_work_routine_card(struct work_struct 
*work)
}
 
/* Power down slot */
-   if (present == 0) {
-   /* Clear down the FIFO */
-   dw_mci_fifo_reset(host);
-#ifdef CONFIG_MMC_DW_IDMAC
-   dw_mci_idmac_reset(host);
-#endif
-
-   }
+   if (present == 0)
+   dw_mci_reset(host);
 
spin_unlock_bh(&host->lock);
 
@@ -2208,8 +2201,11 @@ static bool dw_mci_ctrl_reset(struct dw_mci *host, u32 
reset)
return false;
 }
 
-static inline bool dw_mci_fifo_reset(struct dw_mci *host)
+static bool dw_mci_reset(struct dw_mci *host)
 {
+   u32 flags = SDMMC_CTRL_RESET | SDMMC_CTRL_FIFO_RESET;
+   bool ret = false;
+
/*
 * Reseting generates a block interrupt, hence setting
 * the scatter-gather pointer to NULL.
@@ -2219,15 +2215,60 @@ static inline bool dw_mci_fifo_reset(struct dw_mci 
*host)
host->sg = NULL;
}
 
-   return dw_mci_ctrl_reset(host, SDMMC_CTRL_FIFO_RESET);
-}
+   if (host->use_dma)
+   flags |= SDMMC_CTRL_DMA_RESET;
 
-static inline bool dw_mci_ctrl_all_reset(struct dw_mci *host)
-{
-   return dw_mci_ctrl_reset(host,
-SDMMC_CTRL_FIFO_RESET |
-SDMMC_CTRL_RESET |
-SDMMC_CTRL_DMA_RESET);
+   if (dw_mci_ctrl_reset(host, flags)) {
+   /*
+* In all cases we clear the RAWINTS register to clear any
+* interrupts.
+*/
+   mci_writel(host, RINTSTS, 0x);
+
+   /* if using dma we wait for dma_req to clear */
+   if (host->use_dma) {
+   unsigned long timeout = jiffies + msecs_to_jiffies(500);
+   u32 status;
+   do {
+   status = mci_readl(host, STATUS);
+   if (!(status & SDMMC_STATUS_DMA_REQ))
+   break;
+   cpu_relax();
+   } while (time_before(jiffies, timeout));
+
+   if (status & SDMMC_STATUS_DMA_REQ) {
+   dev_err(host->dev,
+   "%s: Timeout waiting for dma_req to "
+   "clear during reset\n", __func__);
+   goto ciu_out;
+   }
+
+   /* when using DMA next we reset the fifo again */
+   if (!dw_mci_ctrl_reset(host, SDMMC_CTRL_FIFO_RESET))
+   goto ciu_out;
+   }
+   } else {
+   /* if the controller reset bit did clear, then set clock regs */
+   if (!(mci_readl(host, CTRL) & SDMMC_CTR

Re: linux-next: build failure after merge of the mmc-uh tree

2014-07-28 Thread Sonny Rao

On Sun, Jul 27, 2014 at 9:46 PM, Stephen Rothwell  wrote:
> Hi Ulf,
>
> After merging the mmc-uh tree, today's linux-next build (arm
> multi_v7_defconfig) failed like this:
>
> drivers/mmc/host/dw_mmc.c: In function 'dw_mci_reset':
> drivers/mmc/host/dw_mmc.c:2262:3: error: implicit declaration of function 
> 'dw_mci_idmac_reset' [-Werror=implicit-function-declaration]
>dw_mci_idmac_reset(host);
>^

Hi, sorry about that.  It looks like it fails to build when
CONFIG_MMC_DW_IDMAC is not set.
I changed that bit of code from using #ifdef to using just C if
statement, but I think in this case the function being called doesn't
exist when CONFIG_MMC_DW_IDMAC is not set, so that was incorrect and
we should go back to using something like:

#if IS_ENABLED(CONFIG_MMC_DW_IDMAC)
/* It is also recommended that we reset and reprogram idmac */
dw_mci_idmac_reset(host);
#endif


Ulf, I can respin the patch if you'd like or feel free to fix it
yourself too.  Thanks.


>
> Caused by commit 25f7dadbd982 ("mmc: dw_mmc: change to use recommended
> reset procedure").
>
> I have used the mmc-uh tree from next-20140725 for today.
> --
> Cheers,
> Stephen Rothwells...@canb.auug.org.au
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: CPU performance counters not working on big.LITTLE switcher

2014-05-05 Thread Sonny Rao

On Mon, May 5, 2014 at 7:52 PM, Nicolas Pitre  wrote:
> On Mon, 5 May 2014, Sonny Rao wrote:
>
>> Hi, we have the problem today that cpu based performance counters don't
>> work when we're using the big.LITTLE switcher on Exynos 5420, and it
>> doesn't look like code exists to deal with this in the switcher.
>>
>> As it stands right now, if you put an A-15 or A-7 PMU node into your
>> device-tree on a bl_switcher system it's very broken.  At the minimum, I
>> think it should disable performance counters until there's some kind of
>> proper implementation.
>>
>> I looked into trying to make this work, but it turned out to not be as
>> simple as just context switching counters from A-15 to A-7.  The biggest
>> problem is that the PMUs are not architecturally compatible.  There are
>> different events and differing numbers of counters on these two cores.
>>  There's also the tangential issue of representing this in the device tree,
>> but that's far less important.
>>
>> My guess as to how to fix this is to create an "architectural" PMU which
>> contains the intersection of the two performance monitor units with the
>> minimum number of counters supported by either core (which in this case
>> looks to be 4 on the A7).  However, I don't really have the bandwidth to
>> work on this at  the moment.  I was mostly wondering, have other people run
>> into this limitation and is there any sort of plan to work on it?
>
> The Linaro kernel release from a year ago or so contained a hack to make
> PMUs available and cope with the switcher.

Ok, any pointers?  Like I mentioned, if one enables the A15 Counters
with an upstream kernel that's using the switcher, I think things are
very broken, and since the switcher code is upstream, it seems like at
a minimum it would be good to deal with that somehow.  The big hammer
would be just to make hardware PMU support incompatible with the
switcher support, but maybe there are better solutions.

> However, the ultimate solution is to add multi-PMU support in a generic
> way to the kernel and let user space see both A15 and A7 counters.  It
> is then up to the analysis tools to consolidate (some of) them if
> wanted.  And this would work whether the switcher is used, or even when
> the scheduler has learned to do proper task placement for b.L systems
> where tasks may migrate across clusters.

How is that meant to work?  I think you'd need the generic perf-event
subsystem to properly support multiple CPU-type PMUs, which it
currently does not.  In the case of a system using the switcher, would
the events on a particular logical "cpu" just get inter-mingled from
the different cores?  I think it would be difficult to make sense of
data like that without extra information about when the logical cpu
switched from one type to the other.

> Someone at ARM indicated they'd be working on the multi-PMU support if I
> remember correctly.  For that reason, Linaro stopped maintaining the
> initial hack since it was a lot of work to keep it working on top of
> later kernels and a better solution was coming anyway.  I don't know
> what the status of that work is though.
>
>
> Nicolas
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: CPU performance counters not working on big.LITTLE switcher

2014-05-05 Thread Sonny Rao

[sorry for HTML spam, resending]

Hi, we have the problem today that cpu based performance counters don't work
when we're using the big.LITTLE switcher on Exynos 5420, and it doesn't look
like code exists to deal with this in the switcher.

As it stands right now, if you put an A-15 or A-7 PMU node into your
device-tree on a bl_switcher system it's very broken.  At the minimum, I
think it should disable performance counters until there's some kind of
proper implementation.

I looked into trying to make this work, but it turned out to not be as
simple as just context switching counters from A-15 to A-7.  The biggest
problem is that the PMUs are not architecturally compatible.  There are
different events and differing numbers of counters on these two cores.
There's also the tangential issue of representing this in the device tree,
but that's far less important.

My guess as to how to fix this is to create an "architectural" PMU which
contains the intersection of the two performance monitor units with the
minimum number of counters supported by either core (which in this case
looks to be 4 on the A7).  However, I don't really have the bandwidth to
work on this at  the moment.  I was mostly wondering, have other people run
into this limitation and is there any sort of plan to work on it?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC PATCH] drivers: char: Add a dynamic clock for the trace clock

2013-12-11 Thread Sonny Rao

On Wed, Dec 11, 2013 at 5:49 PM, Steven Rostedt  wrote:
> On Wed, 11 Dec 2013 17:17:30 -0800
> Sonny Rao  wrote:
>
>> On Wed, Dec 11, 2013 at 11:30 AM, Stephane Eranian  
>> wrote:
>> > Sonny,
>> >
>> > Your patch has a couple of problems for me:
>> >  - requires CONFIG_TRACING
>> >
>> > You should directly invoke getrawmonotonic()
>> >  and inline the code from trace_clock_getres().
>> >
>> > That's how I managed to compile your kernel module on my system.
>>
>> You need the changes in kernel/trace/trace.c which is why it's
>> dependent on CONFIG_TRACING.
>> If we put those functions elsewhere we could remove that dependency,
>> but it sounds like people want to just fix the clock that perf uses so
>> that it's exportable and not handle this with something like this
>> patch, which is better.
>
> I have no issue moving the trace_clock.c code into lib/ and we can add
> a CONFIG_TRACE_CLOCK option that can be set by perf and ftrace.
>

That sounds like a good idea to me, regardless of what we end up doing.

>>
>> Also, we should ensure that perf and ftrace are guaranteed to use the
>> same clock, I think it just happens to be the same right now.
>
> ftrace has several clocks that it uses:
>
> o local - basically sched_clock()
> o global - something like hpet that is monotonic across CPUs but slower
> o counter - a simple atomic counter (no time associated to it)
> o uptime - jiffy counter
> o perf  - trace_clock, which is what perf uses
> o x86_tsc - the raw tsc counter.
>
> # cat /sys/kernel/debug/trace_clock
> [local] global counter uptime perf x86-tsc
>

Ah ok sorry for the incorrect info there, thanks for clarifying.
So, if I wanted to make sure everything is synced up between ftrace
events and perf events I should say perf here instead of local.

> -- Steve
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC PATCH] drivers: char: Add a dynamic clock for the trace clock

2013-12-11 Thread Sonny Rao

On Wed, Dec 11, 2013 at 11:30 AM, Stephane Eranian  wrote:
> Sonny,
>
> Your patch has a couple of problems for me:
>  - requires CONFIG_TRACING
>
> You should directly invoke getrawmonotonic()
>  and inline the code from trace_clock_getres().
>
> That's how I managed to compile your kernel module on my system.

You need the changes in kernel/trace/trace.c which is why it's
dependent on CONFIG_TRACING.
If we put those functions elsewhere we could remove that dependency,
but it sounds like people want to just fix the clock that perf uses so
that it's exportable and not handle this with something like this
patch, which is better.

Also, we should ensure that perf and ftrace are guaranteed to use the
same clock, I think it just happens to be the same right now.

>
>
> On Mon, Dec 9, 2013 at 8:49 AM, Josh Triplett  wrote:
>> On Fri, Dec 06, 2013 at 04:34:11PM -0800, Sonny Rao wrote:
>>> Based on a suggestion from John Stultz.
>>>
>>> This adds a dynamic clock device which can be used with clock_gettime
>>> to sample the clock source used for time stamping trace events in the
>>> kernel.  The only use for this clock source is to associate user space
>>> events with kernel events on a given kernel.  It is explicitly not
>>> supposed to be used as a generic time source and won't necessarily be
>>> consistent between kernels.
>>>
>>> Signed-off-by: Sonny Rao 
>>
>> Reviewed-by: Josh Triplett 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[RFC PATCH] drivers: char: Add a dynamic clock for the trace clock

2013-12-06 Thread Sonny Rao

Based on a suggestion from John Stultz.

This adds a dynamic clock device which can be used with clock_gettime
to sample the clock source used for time stamping trace events in the
kernel.  The only use for this clock source is to associate user space
events with kernel events on a given kernel.  It is explicitly not
supposed to be used as a generic time source and won't necessarily be
consistent between kernels.

Signed-off-by: Sonny Rao 
---
 drivers/char/Kconfig   |  8 
 drivers/char/Makefile  |  1 +
 drivers/char/trace_clock.c | 99 ++
 include/linux/kernel.h | 12 ++
 kernel/trace/trace.c   | 23 +++
 5 files changed, 143 insertions(+)
 create mode 100644 drivers/char/trace_clock.c

diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index fa3243d..785ab73 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -599,5 +599,13 @@ config TILE_SROM
  device appear much like a simple EEPROM, and knows
  how to partition a single ROM for multiple purposes.
 
+config TRACE_CLOCK_DEV
+   tristate "Dynamic clock type which gives time used for ftrace events"
+   depends on TRACING
+   default y
+   help
+ This device presents a posix dynamic clock which allows user
+ space to sample the clock used for time stamps on trace events
+ in the kernel.
 endmenu
 
diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index a324f93..5cd42e0 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -61,3 +61,4 @@ obj-$(CONFIG_JS_RTC)  += js-rtc.o
 js-rtc-y = rtc.o
 
 obj-$(CONFIG_TILE_SROM)+= tile-srom.o
+obj-$(CONFIG_TRACE_CLOCK_DEV)  += trace_clock.o
diff --git a/drivers/char/trace_clock.c b/drivers/char/trace_clock.c
new file mode 100644
index 000..d73b35d
--- /dev/null
+++ b/drivers/char/trace_clock.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright (C) 2013 The Chromium OS Authors 
+ *All Rights Reserved.
+ *
+ * This file is released under the GPL.
+ *
+ * Posix Dynamic Clock for tracing clock.
+ *
+ * This device is meant to provide a stream which userspace can sample
+ * to match up kernel generated events to user generated events, on a
+ * given kernel.  It is explicitly *not* trying to be a standalone
+ * time source and shouldn't be used for anything else or for
+ * comparisons between different kernels.
+ *
+ */
+#include 
+#include 
+#include 
+
+static dev_t tc_devt;
+static struct class *tc_class;
+static struct device *tc_dev;
+static struct posix_clock tc_pclock;
+
+static int tc_clock_gettime(struct posix_clock *pc, struct timespec *ts)
+{
+   trace_clock_gettime(ts);
+   return 0;
+}
+
+static int tc_clock_getres(struct posix_clock *pc, struct timespec *ts)
+{
+   trace_clock_getres(ts);
+   return 0;
+}
+
+static struct posix_clock_operations tc_clock_ops = {
+   .owner  = THIS_MODULE,
+   .clock_gettime  = tc_clock_gettime,
+   .clock_getres   = tc_clock_getres,
+};
+
+static void __exit trace_clock_exit(void)
+{
+   posix_clock_unregister(&tc_pclock);
+   device_destroy(tc_class, tc_devt);
+   unregister_chrdev_region(tc_devt, 1);
+   class_destroy(tc_class);
+}
+
+static int __init trace_clock_init(void)
+{
+   int err = -ENODEV;
+
+   tc_class = class_create(THIS_MODULE, "trace_clock");
+   if (IS_ERR(tc_class)) {
+   pr_err("trace_clock: failed to allocate class\n");
+   return PTR_ERR(tc_class);
+   }
+
+   err = alloc_chrdev_region(&tc_devt, 0, 1, "trace_clock");
+   if (err < 0) {
+   pr_err("trace_clock: failed to allocate device region\n");
+   goto no_region;
+   }
+   tc_dev = device_create(tc_class, NULL, tc_devt, NULL, "trace_clock");
+   if (!tc_dev) {
+   pr_err("trace_clock: failed to create device\n");
+   goto no_device;
+   }
+
+   tc_pclock.ops = tc_clock_ops;
+
+   err = posix_clock_register(&tc_pclock, tc_devt);
+   if (err < 0) {
+   pr_err("trace_clock: failed to register posix clock %d\n",
+  err);
+   goto no_pclock;
+   }
+   pr_info("Trace clock support registered\n");
+   return 0;
+
+no_pclock:
+   device_destroy(tc_class, tc_devt);
+
+no_device:
+   unregister_chrdev_region(tc_devt, 1);
+
+no_region:
+   class_destroy(tc_class);
+   return err;
+}
+
+subsys_initcall(trace_clock_init);
+module_exit(trace_clock_exit);
+
+MODULE_AUTHOR("Sonny Rao ");
+MODULE_DESCRIPTION("Trace clock device support");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 2ac0277..68a922a 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -51

Re: [PATCH] perf session: Fix infinite loop on invalid perf.data file

2013-09-30 Thread Sonny Rao

On Mon, Sep 30, 2013 at 6:49 AM, David Ahern  wrote:
> On 9/30/13 2:19 AM, Namhyung Kim wrote:
>>
>> From: Namhyung Kim 
>>
>> perf-record updates the header in the perf.data file at termination.
>> Without this update, perf-report (and other processing built-ins)
>> would get into an infinite loop when called.
>>
>> This is because the algorithm in __perf_session__process_events()
>> depends on the data_size which is read from file header.  Use file
>> size directly instead in this case to do the best-effort processing.
>>
>> Cc: David Ahern 
>> Cc: Sonny Rao 
>> Signed-off-by: David Ahern 
>> Signed-off-by: Namhyung Kim 
>
>
> worked ok for me. Sonny can you verify?

Yes, it works for me as well, thanks!


> David
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] perf: fix infinite loop with corrupted header

2013-09-26 Thread Sonny Rao

On Thu, Sep 26, 2013 at 6:34 AM, David Ahern  wrote:
> On 9/25/13 11:20 PM, Sonny Rao wrote:
>>
>> We recently ran into a corrupt perf data file which mostly looked okay
>> but the section size for data was set to 0.  This caused perf report to
>> get into an infinite loop in __perf_session_process_events().  Let's
>> just avoid this by bailing early and reporting it if there's an
>> invalid header.
>
>
> Been suggested before:
> https://lkml.org/lkml/2013/5/9/405
>
> Other changes went in around that time as well. Are you still seeing the
> loop on latest source?

I'm still seeing it on 3.12-rc1.  I haven't tested anything newer or
in the perf git tree.
Thanks

>
> David
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] perf: fix infinite loop with corrupted header

2013-09-25 Thread Sonny Rao

We recently ran into a corrupt perf data file which mostly looked okay
but the section size for data was set to 0.  This caused perf report to
get into an infinite loop in __perf_session_process_events().  Let's
just avoid this by bailing early and reporting it if there's an
invalid header.

Signed-off-by: Sonny Rao 
---
 tools/perf/util/header.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 26441d0..085ef76 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -2582,6 +2582,10 @@ int perf_file_header__read(struct perf_file_header 
*header,
ph->data_offset  = header->data.offset;
ph->data_size= header->data.size;
ph->feat_offset  = header->data.offset + header->data.size;
+
+   if (!header->data.size)
+   die("corrupted header, invalid size 0 for data section\n");
+
return 0;
 }
 
-- 
1.8.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: Fans at full speed after resume

2013-05-15 Thread Sonny Rao

On Tue, May 14, 2013 at 9:56 PM, Sonny Rao  wrote:
> On Tue, May 14, 2013 at 9:34 PM, Sonny Rao  wrote:
>> On Tue, May 14, 2013 at 9:29 PM, Zhang Rui  wrote:
>>> On Wed, 2013-05-15 at 12:26 +0800, Zhang Rui wrote:
>>>> please
>>>>
>>>> On Tue, 2013-05-14 at 21:18 -0700, Sonny Rao wrote:
>>>> > Hi, I've seen a regression in kernels since 3.7 on x86 devices where
>>>> > the kernel turns the system fans on to max speed after resuming from
>>>> > ram.  Other people have noticed it as well, for example see
>>>> > https://bugzilla.redhat.com/show_bug.cgi?id=895276
>>>> >
>>>> please check if this is a duplicate of bug
>>>> https://bugzilla.kernel.org/show_bug.cgi?id=56591
>>> or you can try 3.10-rc1 to see if the problem still exists or not.
>>
>> Ok, I patched in the fix from that bugzilla --
>> 928c5edbe6f7cb0d1c71bc2353d091bc5b114fe3
>> but I'm still seeing the issue, I'll try 3.10-rc1 next
>>
>
> 3.10-rc1 seems good
> 3.9.2 is okay, though fans do seem to be on more for a while after
> resume, it eventually turns off
> 3.8.13 seems to still be broken, with fans at maximum
>

So, I did a reverse bisect between 3.9 and 3.9.1 and found that the
commit you mentioned does indeed fix the problem on 3.9, and I
double-checked that it doesn't seem to be fixed on 3.8.13.  So, I made
a 3.8.13 version of this debug patch in the bugzilla entry
https://bugzilla.kernel.org/attachment.cgi?id=98671

and I never see the thermal_cdev_update getting called for cdev 0 or
cdev 1, yet they are set to 1 after resume.  Perhaps something else is
enabling them?

>>>
>>> thanks,
>>> rui
>>>> > For example on the Samsung 550 Chromebook, we have one thermal zone
>>>> > and have 5 cooling_devices, 0-4, which correspond to 5 possible fan
>>>> > speeds.  Under typical idle, only cooling_device4 and maybe
>>>> > cooling_device3 are active, depending on temperature:
>>>> >
>>>> > cat /sys/class/thermal/cooling_device[01234]/cur_state
>>>> > /sys/class/thermal/thermal_zone0/temp
>>>> > 0
>>>> > 0
>>>> > 0
>>>> > 0
>>>> > 1
>>>> > 57000
>>>> >
>>>> > however after a suspend/resume, we see that cooling_devices 0 and 1
>>>> > become active:
>>>> > cat /sys/class/thermal/cooling_device[01234]/cur_state
>>>> > /sys/class/thermal/thermal_zone0/temp
>>>> > 1
>>>> > 1
>>>> > 0
>>>> > 0
>>>> > 1
>>>> > 54000
>>>> >
>>>> > and it seems to stay that way, even though the temperature is low
>>>> > enough that the fan shouldn't be running at that speed.  If I manually
>>>> > disable cooling_devices 0 and 1 then fan control works normally again.
>>>> >
>>>> > I started bisecting it and was able to do so up until this commit:
>>>> > commit 29b19e250434c6193c8b8e4c34c9c6284dd4f101
>>>> > Merge: 125c4c7 c072fed
>>>> > Author: Len Brown 
>>>> > AuthorDate: Tue Oct 9 01:35:52 2012 -0400
>>>> > Commit: Len Brown 
>>>> > CommitDate: Tue Oct 9 01:35:52 2012 -0400
>>>> >
>>>> > Merge branch 'release' of
>>>> > git://git.kernel.org/pub/scm/linux/kernel/git/rzhang/linux into
>>>> > thermal
>>>> >
>>>> > unfortunately, I'm not able to successfully do a suspend/resume on the
>>>> > commits in that merge, so I wasn't able to bisect down to the exact
>>>> > commit.
>>>> >
>>>> > I did confirm that one parent of the merge is okay: commit
>>>> > 125c4c706b680c7831f0966ff873c1ad0354ec25 idr: rename MAX_LEVEL to
>>>> > MAX_IDR_LEVEL
>>>> >
>>>> > so I think it falls somewhere in this list of commits:
>>>> > c072fed95c9855a920c114d7fa3351f0f54ea06e...e3f25e6e5836c4790fbe395ff42e241f372d859d
>>>> >
>>>> > c072fed9 thermal: Exynos: Fix NULL pointer dereference in
>>>> > exynos_unregister_thermal()
>>>> > a4b6fec9 Thermal: Fix bug on cpu_cooling, cooling device's id conflict 
>>>> > problem.
>>>> > 79e093c3 thermal: exynos: Use devm_* functions
>>>> > 17be868e ARM: exynos: add thermal sensor driver platform data support
>>>

Re: Fans at full speed after resume

2013-05-14 Thread Sonny Rao

On Tue, May 14, 2013 at 9:34 PM, Sonny Rao  wrote:
> On Tue, May 14, 2013 at 9:29 PM, Zhang Rui  wrote:
>> On Wed, 2013-05-15 at 12:26 +0800, Zhang Rui wrote:
>>> please
>>>
>>> On Tue, 2013-05-14 at 21:18 -0700, Sonny Rao wrote:
>>> > Hi, I've seen a regression in kernels since 3.7 on x86 devices where
>>> > the kernel turns the system fans on to max speed after resuming from
>>> > ram.  Other people have noticed it as well, for example see
>>> > https://bugzilla.redhat.com/show_bug.cgi?id=895276
>>> >
>>> please check if this is a duplicate of bug
>>> https://bugzilla.kernel.org/show_bug.cgi?id=56591
>> or you can try 3.10-rc1 to see if the problem still exists or not.
>
> Ok, I patched in the fix from that bugzilla --
> 928c5edbe6f7cb0d1c71bc2353d091bc5b114fe3
> but I'm still seeing the issue, I'll try 3.10-rc1 next
>

3.10-rc1 seems good
3.9.2 is okay, though fans do seem to be on more for a while after
resume, it eventually turns off
3.8.13 seems to still be broken, with fans at maximum

>>
>> thanks,
>> rui
>>> > For example on the Samsung 550 Chromebook, we have one thermal zone
>>> > and have 5 cooling_devices, 0-4, which correspond to 5 possible fan
>>> > speeds.  Under typical idle, only cooling_device4 and maybe
>>> > cooling_device3 are active, depending on temperature:
>>> >
>>> > cat /sys/class/thermal/cooling_device[01234]/cur_state
>>> > /sys/class/thermal/thermal_zone0/temp
>>> > 0
>>> > 0
>>> > 0
>>> > 0
>>> > 1
>>> > 57000
>>> >
>>> > however after a suspend/resume, we see that cooling_devices 0 and 1
>>> > become active:
>>> > cat /sys/class/thermal/cooling_device[01234]/cur_state
>>> > /sys/class/thermal/thermal_zone0/temp
>>> > 1
>>> > 1
>>> > 0
>>> > 0
>>> > 1
>>> > 54000
>>> >
>>> > and it seems to stay that way, even though the temperature is low
>>> > enough that the fan shouldn't be running at that speed.  If I manually
>>> > disable cooling_devices 0 and 1 then fan control works normally again.
>>> >
>>> > I started bisecting it and was able to do so up until this commit:
>>> > commit 29b19e250434c6193c8b8e4c34c9c6284dd4f101
>>> > Merge: 125c4c7 c072fed
>>> > Author: Len Brown 
>>> > AuthorDate: Tue Oct 9 01:35:52 2012 -0400
>>> > Commit: Len Brown 
>>> > CommitDate: Tue Oct 9 01:35:52 2012 -0400
>>> >
>>> > Merge branch 'release' of
>>> > git://git.kernel.org/pub/scm/linux/kernel/git/rzhang/linux into
>>> > thermal
>>> >
>>> > unfortunately, I'm not able to successfully do a suspend/resume on the
>>> > commits in that merge, so I wasn't able to bisect down to the exact
>>> > commit.
>>> >
>>> > I did confirm that one parent of the merge is okay: commit
>>> > 125c4c706b680c7831f0966ff873c1ad0354ec25 idr: rename MAX_LEVEL to
>>> > MAX_IDR_LEVEL
>>> >
>>> > so I think it falls somewhere in this list of commits:
>>> > c072fed95c9855a920c114d7fa3351f0f54ea06e...e3f25e6e5836c4790fbe395ff42e241f372d859d
>>> >
>>> > c072fed9 thermal: Exynos: Fix NULL pointer dereference in
>>> > exynos_unregister_thermal()
>>> > a4b6fec9 Thermal: Fix bug on cpu_cooling, cooling device's id conflict 
>>> > problem.
>>> > 79e093c3 thermal: exynos: Use devm_* functions
>>> > 17be868e ARM: exynos: add thermal sensor driver platform data support
>>> > 7e0b55e6 thermal: exynos: register the tmu sensor with the kernel thermal 
>>> > layer
>>> > f22d9c03c thermal: exynos5: add exynos5250 thermal sensor driver support
>>> > c48cbba6 hwmon: exynos4: move thermal sensor driver to driver/thermal 
>>> > directory
>>> > 02361418 thermal: add generic cpufreq cooling implementation
>>> > a7a3b8c8 Fix a build error.
>>> > 204dd1d3 thermal: Fix potential NULL pointer accesses
>>> > 1e426ffdd thermal: add Renesas R-Car thermal sensor support
>>> > 79a49168 thermal: fix potential out-of-bounds memory access
>>> > f4a821ce6 Thermal: Introduce locking for cdev.thermal_instances list.
>>> > 908b9fb79 Thermal: Unify the code for both active and passive cooling
>>> > ce119f832 Th

Re: Fans at full speed after resume

2013-05-14 Thread Sonny Rao

On Tue, May 14, 2013 at 9:29 PM, Zhang Rui  wrote:
> On Wed, 2013-05-15 at 12:26 +0800, Zhang Rui wrote:
>> please
>>
>> On Tue, 2013-05-14 at 21:18 -0700, Sonny Rao wrote:
>> > Hi, I've seen a regression in kernels since 3.7 on x86 devices where
>> > the kernel turns the system fans on to max speed after resuming from
>> > ram.  Other people have noticed it as well, for example see
>> > https://bugzilla.redhat.com/show_bug.cgi?id=895276
>> >
>> please check if this is a duplicate of bug
>> https://bugzilla.kernel.org/show_bug.cgi?id=56591
> or you can try 3.10-rc1 to see if the problem still exists or not.

Ok, I patched in the fix from that bugzilla --
928c5edbe6f7cb0d1c71bc2353d091bc5b114fe3
but I'm still seeing the issue, I'll try 3.10-rc1 next

>
> thanks,
> rui
>> > For example on the Samsung 550 Chromebook, we have one thermal zone
>> > and have 5 cooling_devices, 0-4, which correspond to 5 possible fan
>> > speeds.  Under typical idle, only cooling_device4 and maybe
>> > cooling_device3 are active, depending on temperature:
>> >
>> > cat /sys/class/thermal/cooling_device[01234]/cur_state
>> > /sys/class/thermal/thermal_zone0/temp
>> > 0
>> > 0
>> > 0
>> > 0
>> > 1
>> > 57000
>> >
>> > however after a suspend/resume, we see that cooling_devices 0 and 1
>> > become active:
>> > cat /sys/class/thermal/cooling_device[01234]/cur_state
>> > /sys/class/thermal/thermal_zone0/temp
>> > 1
>> > 1
>> > 0
>> > 0
>> > 1
>> > 54000
>> >
>> > and it seems to stay that way, even though the temperature is low
>> > enough that the fan shouldn't be running at that speed.  If I manually
>> > disable cooling_devices 0 and 1 then fan control works normally again.
>> >
>> > I started bisecting it and was able to do so up until this commit:
>> > commit 29b19e250434c6193c8b8e4c34c9c6284dd4f101
>> > Merge: 125c4c7 c072fed
>> > Author: Len Brown 
>> > AuthorDate: Tue Oct 9 01:35:52 2012 -0400
>> > Commit: Len Brown 
>> > CommitDate: Tue Oct 9 01:35:52 2012 -0400
>> >
>> > Merge branch 'release' of
>> > git://git.kernel.org/pub/scm/linux/kernel/git/rzhang/linux into
>> > thermal
>> >
>> > unfortunately, I'm not able to successfully do a suspend/resume on the
>> > commits in that merge, so I wasn't able to bisect down to the exact
>> > commit.
>> >
>> > I did confirm that one parent of the merge is okay: commit
>> > 125c4c706b680c7831f0966ff873c1ad0354ec25 idr: rename MAX_LEVEL to
>> > MAX_IDR_LEVEL
>> >
>> > so I think it falls somewhere in this list of commits:
>> > c072fed95c9855a920c114d7fa3351f0f54ea06e...e3f25e6e5836c4790fbe395ff42e241f372d859d
>> >
>> > c072fed9 thermal: Exynos: Fix NULL pointer dereference in
>> > exynos_unregister_thermal()
>> > a4b6fec9 Thermal: Fix bug on cpu_cooling, cooling device's id conflict 
>> > problem.
>> > 79e093c3 thermal: exynos: Use devm_* functions
>> > 17be868e ARM: exynos: add thermal sensor driver platform data support
>> > 7e0b55e6 thermal: exynos: register the tmu sensor with the kernel thermal 
>> > layer
>> > f22d9c03c thermal: exynos5: add exynos5250 thermal sensor driver support
>> > c48cbba6 hwmon: exynos4: move thermal sensor driver to driver/thermal 
>> > directory
>> > 02361418 thermal: add generic cpufreq cooling implementation
>> > a7a3b8c8 Fix a build error.
>> > 204dd1d3 thermal: Fix potential NULL pointer accesses
>> > 1e426ffdd thermal: add Renesas R-Car thermal sensor support
>> > 79a49168 thermal: fix potential out-of-bounds memory access
>> > f4a821ce6 Thermal: Introduce locking for cdev.thermal_instances list.
>> > 908b9fb79 Thermal: Unify the code for both active and passive cooling
>> > ce119f832 Thermal: Introduce simple arbitrator for setting device cooling 
>> > state
>> > b5e4ae62 Thermal: List thermal_instance in thermal_cooling_device.
>> > cddf31b3b Thermal: Rename thermal_instance.node to 
>> > thermal_instance.tz_node.
>> > 2d374139 Thermal: Rename thermal_zone_device.cooling_devices
>> > b81b6ba3 Thermal: rename structure thermal_cooling_device_instance to
>> > thermal_instance
>> > 4ae46befb Thermal: Introduce thermal_zone_trip_update()
>> > 1b7ddb84 Thermal: Remove tc1/tc2 in generic thermal layer.
>> > 601f3d424 Thermal: Introduce .get_trend() callback.
>> > 9d99842f9 Thermal: set upper and lower limits
>> > 74051ba5 Thermal: Introduce cooling states range support
>> >
>> > When I get time, I'll try to rebase those commits onto the IDR commit
>> > and see if I can get a better bisect.  Any insights into the problem
>> > would be appreciated, thanks.
>>
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-pm" in
>> the body of a message to majord...@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Fans at full speed after resume

2013-05-14 Thread Sonny Rao

Hi, I've seen a regression in kernels since 3.7 on x86 devices where
the kernel turns the system fans on to max speed after resuming from
ram.  Other people have noticed it as well, for example see
https://bugzilla.redhat.com/show_bug.cgi?id=895276

For example on the Samsung 550 Chromebook, we have one thermal zone
and have 5 cooling_devices, 0-4, which correspond to 5 possible fan
speeds.  Under typical idle, only cooling_device4 and maybe
cooling_device3 are active, depending on temperature:

cat /sys/class/thermal/cooling_device[01234]/cur_state
/sys/class/thermal/thermal_zone0/temp
0
0
0
0
1
57000

however after a suspend/resume, we see that cooling_devices 0 and 1
become active:
cat /sys/class/thermal/cooling_device[01234]/cur_state
/sys/class/thermal/thermal_zone0/temp
1
1
0
0
1
54000

and it seems to stay that way, even though the temperature is low
enough that the fan shouldn't be running at that speed.  If I manually
disable cooling_devices 0 and 1 then fan control works normally again.

I started bisecting it and was able to do so up until this commit:
commit 29b19e250434c6193c8b8e4c34c9c6284dd4f101
Merge: 125c4c7 c072fed
Author: Len Brown 
AuthorDate: Tue Oct 9 01:35:52 2012 -0400
Commit: Len Brown 
CommitDate: Tue Oct 9 01:35:52 2012 -0400

Merge branch 'release' of
git://git.kernel.org/pub/scm/linux/kernel/git/rzhang/linux into
thermal

unfortunately, I'm not able to successfully do a suspend/resume on the
commits in that merge, so I wasn't able to bisect down to the exact
commit.

I did confirm that one parent of the merge is okay: commit
125c4c706b680c7831f0966ff873c1ad0354ec25 idr: rename MAX_LEVEL to
MAX_IDR_LEVEL

so I think it falls somewhere in this list of commits:
c072fed95c9855a920c114d7fa3351f0f54ea06e...e3f25e6e5836c4790fbe395ff42e241f372d859d

c072fed9 thermal: Exynos: Fix NULL pointer dereference in
exynos_unregister_thermal()
a4b6fec9 Thermal: Fix bug on cpu_cooling, cooling device's id conflict problem.
79e093c3 thermal: exynos: Use devm_* functions
17be868e ARM: exynos: add thermal sensor driver platform data support
7e0b55e6 thermal: exynos: register the tmu sensor with the kernel thermal layer
f22d9c03c thermal: exynos5: add exynos5250 thermal sensor driver support
c48cbba6 hwmon: exynos4: move thermal sensor driver to driver/thermal directory
02361418 thermal: add generic cpufreq cooling implementation
a7a3b8c8 Fix a build error.
204dd1d3 thermal: Fix potential NULL pointer accesses
1e426ffdd thermal: add Renesas R-Car thermal sensor support
79a49168 thermal: fix potential out-of-bounds memory access
f4a821ce6 Thermal: Introduce locking for cdev.thermal_instances list.
908b9fb79 Thermal: Unify the code for both active and passive cooling
ce119f832 Thermal: Introduce simple arbitrator for setting device cooling state
b5e4ae62 Thermal: List thermal_instance in thermal_cooling_device.
cddf31b3b Thermal: Rename thermal_instance.node to thermal_instance.tz_node.
2d374139 Thermal: Rename thermal_zone_device.cooling_devices
b81b6ba3 Thermal: rename structure thermal_cooling_device_instance to
thermal_instance
4ae46befb Thermal: Introduce thermal_zone_trip_update()
1b7ddb84 Thermal: Remove tc1/tc2 in generic thermal layer.
601f3d424 Thermal: Introduce .get_trend() callback.
9d99842f9 Thermal: set upper and lower limits
74051ba5 Thermal: Introduce cooling states range support

When I get time, I'll try to rebase those commits onto the IDR commit
and see if I can get a better bisect.  Any insights into the problem
would be appreciated, thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: IO regression after ab8fabd46f on x86 kernels with high memory

2013-05-01 Thread Sonny Rao

On Mon, Apr 29, 2013 at 3:08 PM, Pierre-Loup A. Griffais
 wrote:
> On 04/29/2013 03:03 PM, Linus Torvalds wrote:
>>
>> On Mon, Apr 29, 2013 at 2:53 PM, Pierre-Loup A. Griffais
>>  wrote:
>>>
>>>
>>> Other than this particular concern, what's the high-level take-away? Is
>>> PAE
>>> support in the Linux kernel a false promise that distros should not be
>>> shipping by default, if at all? Should it be removed from the kernel
>>> entirely if these configurations are knowingly broken by commits like
>>> this?
>>
>>
>> PAE is "make it barely work". The whole concept is fundamentally
>> flawed, and anybody who runs a 32-bit kernel with 16GB of RAM doesn't
>> even understand *how* flawed and stupid that is.
>>
>> Don't do it. Upgrade to 64-bit, or live with the fact that IO
>> performance will suck. The fact that it happened to work better under
>> your particular load with one particular IO size is entirely just
>> "random noise".
>>
>> Yeah, the difference between "we can cache it" and "we have to do IO"
>> is huge. With a 32-bit kernel, we do IO much earlier now, just to
>> avoid some really nasty situations. That makes you go from the "can
>> sit in the cache" to the "do lots of IO" situation. Tough.
>>
>> Seriously, you can compile yourself a 64-bit kernel and continue to
>> use your 32-bit user-land. And you can complain to whatever distro you
>> used that it didn't do that in the first place. But we're not going to
>> bother with trying to tune PAE for some particular load. It's just not
>> worth it to anybody.
>
>
> All of this came from me trying to reproduce slowdowns reported by other
> people; I personally run a 64-bit kernel and understand how bad of an idea
> it is to attempt to run 32-bit kernels with PAE enabled on modern machines.
> However, my goal is to avoid ending up with a variety of end-users that
> don't necessarily understand this getting bitten by it and breaking their
> systems by upgrading their kernels. I will indeed bring this up with
> distributors and point out that shipping PAE kernels by default is not a
> good idea given these problems and your stance on the matter.
>

Sorry, just saw this (my stupid gmail filters for lkml).  The slow-down
we ran into wasn't even on PAE -- it was *just* with highmem on a 2GB
system.  The non-zero amount (90MB? or so) of highmem was enough to
cause major problems due to that particular underflow.

I would say regardless of how much memory you have, if the system can
use a 64-bit kernel, then it almost certainly should.  I've seen some
very minor performance impacts on 64-bit capable Atom systems with
tiny L2 caches, but it's almost in the noise and not worth the pain.

> Thanks,
>  - Pierre-Loup
>
>>
>>  Linus
>>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 2/2] mm: forcely swapout when we are out of page cache

2013-01-16 Thread Sonny Rao

On Tue, Jan 15, 2013 at 8:47 PM, Minchan Kim  wrote:
> On Tue, Jan 15, 2013 at 05:21:15PM -0800, Sonny Rao wrote:
>> On Tue, Jan 15, 2013 at 4:50 PM, Andrew Morton
>>  wrote:
>> > On Tue, 15 Jan 2013 16:32:38 -0800
>> > Sonny Rao  wrote:
>> >
>> >> >> It's for saving the power to increase battery life.
>> >> >
>> >> > It might well have that effect, dunno.  That wasn't my intent.  Testing
>> >> > needed!
>> >> >
>> >>
>> >> Power saving is certainly why we had it on originally for ChromeOS,
>> >> but we turned it off due to misbehavior.
>> >>
>> >> Specifically, we saw a pathological behavior where we'd end up writing
>> >> to the disk every few seconds when laptop mode was turned on.  This
>> >> turned out to be because laptop-mode sets a timer which is used to
>> >> check for new dirty data after the initial flush and writes that out
>> >> before spinning the disk down, and on ChromeOS various chatty daemons
>> >> on the system were logging and dirtying data more or less constantly
>> >> so there was almost always something there to be written out.  So what
>> >> ended up happening was that we'd need to do a read, then wake up the
>> >> disk, and then keep writing every few seconds for a long period of
>> >> time, which had the opposite effect from what we wanted.
>> >
>> > So after the read, the disk would chatter away doing a dribble of
>> > writes?  That sounds like plain brokenness (and why did the chrome guys
>> > not tell anyone about it?!?!?).
>>
>> Yes, either read or fsync.  I ranted about it a little (here:
>> http://marc.info/?l=linux-mm&m=135422986220016&w=4), but mostly
>> assumed it was working as expected, and that ChromeOS was just
>> dirtying data at an absurd pace.  Might have been a bad assumption and
>> I could have been more explicit about reporting it, sorry about that.
>>
>> > The idea is that when the physical
>> > read occurs, we should opportunistically flush out all pending writes,
>> > while the disk is running.  Then go back into
>> > buffer-writes-for-a-long-time mode.
>> >
>>
>> See the comment in page-writeback.c above laptop_io_completion():
>>
>> /*
>>  * We've spun up the disk and we're in laptop mode: schedule writeback
>>  * of all dirty data a few seconds from now.  If the flush is already
>> scheduled
>>  * then push it back - the user is still using the disk.
>>  */
>> void laptop_io_completion(struct backing_dev_info *info)
>>
>> What ends up happening fairly often is that there's always something
>> dirty within that few seconds (or even one second) on our system.
>>
>> > I forget what we did with fsync() and friends.  Quite a lot of
>> > pestiferous applications like to do fsync quite frequently.  I had a
>> > special kernel in which fsync() consisted of "return 0;", but ISTR
>> > there being some resistance to productizing that idea.
>> >
>>
>> Yeah, we have this problem and we try to fix up users of fsync() as we
>> find them but it's a bit of a never-ending battle.  Such a feature
>> would be useful.
>>
>> >>  The issues
>> >> with zram swap just confirmed that we didn't want laptop mode.
>> >>
>> >> Most of our devices have had SSDs rather than spinning disks, so noise
>> >> wasn't an issue, although when we finally did support an official
>> >> device with a spinning disk people certainly complained when the disk
>> >> started clicking all the time
>> >
>> > hm, it's interesting that the general idea still has validity.  It
>> > would be a fun project for someone to sniff out all the requirements,
>> > fixup/enhance/rewrite the current implementation and generally make it
>> > all spiffy and nice.
>> >
>> >> (due to the underflow in the writeback code).
>> >
>> > To what underflow do you refer?
>> >
>> http://git.kernel.org/?p=linux/kernel/git/torvalds/linux.git;a=commit;h=c8b74c2f6604923de91f8aa6539f8bb934736754
>>
>> That particular bug caused writes to happen almost instantly after the
>> underflow occurred, and consequently slowed write throughput to a crawl
>> because there was no chance for contiguous writes to gather.
>>
>> >> We do know that current SSDs save a significant amount of
&g

1 2 >

1 - 100 of 130 matches

Mail list logo