date:20131114

[PATCH net v3 0/4] r8152 bug fixes

2013-11-14 Thread Hayes Wang

I split the code for stopping/waking the tx queue into a separate patch.

Hayes Wang (4):
  r8152: fix tx/rx memory overflow
  r8152: modify the tx flow
  r8152: support stopping/waking tx queue
  r8152: fix incorrect type in assignment

 drivers/net/usb/r8152.c | 109 
 1 file changed, 45 insertions(+), 64 deletions(-)

-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH net v3 1/4] r8152: fix tx/rx memory overflow

2013-11-14 Thread Hayes Wang

The tx/rx would access the memory which is out of the desired range.
Modify the method of checking the end of the memory to avoid it.

For r8152_tx_agg_fill(), the variable remain may become negative.
However, it is declared as unsigned, so the while loop wouldn't
break when reaching the end of the desired memory. Although changing
the declaration from unsigned to signed is enough to fix it, I also
modify the checking method to be safe. Replace

remain = rx_buf_sz - sizeof(*tx_desc) -
 (u32)((void *)tx_data - agg->head);

with

remain = rx_buf_sz - (int)(tx_agg_align(tx_data) - agg->head);

to make sure the variable remain is always positive. Then, the
overflow wouldn't happen.

For rx_bottom(), the rx_desc should not be used to calculate the
packet length before making sure the rx_desc is in the desired range.
Split the check into two parts. First, check that the descriptor is
within the memory. Second, use the descriptor to find out the packet
length and check that the packet is within the memory.

Signed-off-by: Hayes Wang 
---
 drivers/net/usb/r8152.c | 30 +-
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
index f3fce41..5dbfe50 100644
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -24,7 +24,7 @@
 #include 
 
 /* Version Information */
-#define DRIVER_VERSION "v1.01.0 (2013/08/12)"
+#define DRIVER_VERSION "v1.02.0 (2013/10/28)"
 #define DRIVER_AUTHOR "Realtek linux nic maintainers "
 #define DRIVER_DESC "Realtek RTL8152 Based USB 2.0 Ethernet Adapters"
 #define MODULENAME "r8152"
@@ -1136,14 +1136,14 @@ r8152_tx_csum(struct r8152 *tp, struct tx_desc *desc, 
struct sk_buff *skb)
 
 static int r8152_tx_agg_fill(struct r8152 *tp, struct tx_agg *agg)
 {
-   u32 remain;
+   int remain;
u8 *tx_data;
 
tx_data = agg->head;
agg->skb_num = agg->skb_len = 0;
-   remain = rx_buf_sz - sizeof(struct tx_desc);
+   remain = rx_buf_sz;
 
-   while (remain >= ETH_ZLEN) {
+   while (remain >= ETH_ZLEN + sizeof(struct tx_desc)) {
struct tx_desc *tx_desc;
struct sk_buff *skb;
unsigned int len;
@@ -1152,12 +1152,14 @@ static int r8152_tx_agg_fill(struct r8152 *tp, struct 
tx_agg *agg)
if (!skb)
break;
 
+   remain -= sizeof(*tx_desc);
len = skb->len;
if (remain < len) {
skb_queue_head(>tx_queue, skb);
break;
}
 
+   tx_data = tx_agg_align(tx_data);
tx_desc = (struct tx_desc *)tx_data;
tx_data += sizeof(*tx_desc);
 
@@ -1167,9 +1169,8 @@ static int r8152_tx_agg_fill(struct r8152 *tp, struct 
tx_agg *agg)
agg->skb_len += len;
dev_kfree_skb_any(skb);
 
-   tx_data = tx_agg_align(tx_data + len);
-   remain = rx_buf_sz - sizeof(*tx_desc) -
-(u32)((void *)tx_data - agg->head);
+   tx_data += len;
+   remain = rx_buf_sz - (int)(tx_agg_align(tx_data) - agg->head);
}
 
usb_fill_bulk_urb(agg->urb, tp->udev, usb_sndbulkpipe(tp->udev, 2),
@@ -1188,7 +1189,6 @@ static void rx_bottom(struct r8152 *tp)
list_for_each_safe(cursor, next, >rx_done) {
struct rx_desc *rx_desc;
struct rx_agg *agg;
-   unsigned pkt_len;
int len_used = 0;
struct urb *urb;
u8 *rx_data;
@@ -1204,17 +1204,22 @@ static void rx_bottom(struct r8152 *tp)
 
rx_desc = agg->head;
rx_data = agg->head;
-   pkt_len = le32_to_cpu(rx_desc->opts1) & RX_LEN_MASK;
-   len_used += sizeof(struct rx_desc) + pkt_len;
+   len_used += sizeof(struct rx_desc);
 
-   while (urb->actual_length >= len_used) {
+   while (urb->actual_length > len_used) {
struct net_device *netdev = tp->netdev;
struct net_device_stats *stats;
+   unsigned pkt_len;
struct sk_buff *skb;
 
+   pkt_len = le32_to_cpu(rx_desc->opts1) & RX_LEN_MASK;
if (pkt_len < ETH_ZLEN)
break;
 
+   len_used += pkt_len;
+   if (urb->actual_length < len_used)
+   break;
+
stats = rtl8152_get_stats(netdev);
 
pkt_len -= 4; /* CRC */
@@ -1234,9 +1239,8 @@ static void rx_bottom(struct r8152 *tp)
 
rx_data = rx_agg_align(rx_data + pkt_len + 4);
rx_desc = (struct rx_desc *)rx_data;
-   pkt_len = le32_to_cpu(rx_desc->opts1) & RX_LEN_MASK;

Re: 3.10.16 cgroup_mutex deadlock

2013-11-14 Thread Tejun Heo

Hello,

Shawn, Hugh, can you please verify whether the attached patch makes
the deadlock go away?

Thanks.

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e0839bc..dc9dc06 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -90,6 +90,14 @@ static DEFINE_MUTEX(cgroup_mutex);
 static DEFINE_MUTEX(cgroup_root_mutex);
 
 /*
+ * cgroup destruction makes heavy use of work items and there can be a lot
+ * of concurrent destructions.  Use a separate workqueue so that cgroup
+ * destruction work items don't end up filling up max_active of system_wq
+ * which may lead to deadlock.
+ */
+static struct workqueue_struct *cgroup_destroy_wq;
+
+/*
  * Generate an array of cgroup subsystem pointers. At boot time, this is
  * populated with the built in subsystems, and modular subsystems are
  * registered after that. The mutable section of this array is protected by
@@ -871,7 +879,7 @@ static void cgroup_free_rcu(struct rcu_head *head)
struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
 
INIT_WORK(>destroy_work, cgroup_free_fn);
-   schedule_work(>destroy_work);
+   queue_work(cgroup_destroy_wq, >destroy_work);
 }
 
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -4254,7 +4262,7 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
 * css_put().  dput() requires process context which we don't have.
 */
INIT_WORK(>destroy_work, css_free_work_fn);
-   schedule_work(>destroy_work);
+   queue_work(cgroup_destroy_wq, >destroy_work);
 }
 
 static void css_release(struct percpu_ref *ref)
@@ -4544,7 +4552,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
container_of(ref, struct cgroup_subsys_state, refcnt);
 
INIT_WORK(>destroy_work, css_killed_work_fn);
-   schedule_work(>destroy_work);
+   queue_work(cgroup_destroy_wq, >destroy_work);
 }
 
 /**
@@ -5025,6 +5033,17 @@ int __init cgroup_init(void)
if (err)
return err;
 
+   /*
+* There isn't much point in executing destruction path in
+* parallel.  Good chunk is serialized with cgroup_mutex anyway.
+* Use 1 for @max_active.
+*/
+   cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
+   if (!cgroup_destroy_wq) {
+   err = -ENOMEM;
+   goto out;
+   }
+
for_each_builtin_subsys(ss, i) {
if (!ss->early_init)
cgroup_init_subsys(ss);
@@ -5062,9 +5081,11 @@ int __init cgroup_init(void)
proc_create("cgroups", 0, NULL, _cgroupstats_operations);
 
 out:
-   if (err)
+   if (err) {
+   if (cgroup_destroy_wq)
+   destroy_workqueue(cgroup_destroy_wq);
bdi_destroy(_backing_dev_info);
-
+   }
return err;
 }
 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [f2fs-dev][PATCH V2 4/6] f2fs: Key functions to handle inline data

2013-11-14 Thread Jaegeuk Kim

Hi Huajun,

[snip]

> +static int __f2fs_convert_inline_data(struct inode *inode, struct page *page)
> +{
> + int err;
> + struct page *ipage;
> + struct dnode_of_data dn;
> + void *src_addr, *dst_addr;
> + block_t old_blk_addr, new_blk_addr;
> + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
> +
> + f2fs_lock_op(sbi);
> + ipage = get_node_page(sbi, inode->i_ino);
> + if (IS_ERR(ipage))
> + return PTR_ERR(ipage);
> +
> + /*
> +  * i_addr[0] is not used for inline data,
> +  * so reserving new block will not destroy inline data
> +  */
> + set_new_dnode(, inode, ipage, ipage, 0);
> + err = f2fs_reserve_block(, 0);
> + if (err) {
> + f2fs_put_page(ipage, 1);
> + f2fs_unlock_op(sbi);
> + return err;
> + }
> +
> + src_addr = inline_data_addr(ipage);
> + dst_addr = page_address(page);
> + zero_user_segment(page, 0, PAGE_CACHE_SIZE);
> +
> + /* Copy the whole inline data block */
> + memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
> +
> + /* write data page to try to make data consistent */
> + old_blk_addr = dn.data_blkaddr;
> + set_page_writeback(page);
> + write_data_page(inode, page, ,
> + old_blk_addr, _blk_addr);
> + update_extent_cache(new_blk_addr, );
> + f2fs_wait_on_page_writeback(page, DATA, true);
> +
> + /* clear inline data and flag after data writeback */
> + zero_user_segment(ipage, INLINE_DATA_OFFSET,
> +  INLINE_DATA_OFFSET + MAX_INLINE_DATA);
> + clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
> +
> + sync_inode_page();
> + f2fs_put_page(ipage, 1);

Again, it seems that you missed what I mentioned.
If we write the inlined data block only, we cannot recover the data
block after SPO (sudden power-off).
In order to avoid that, we should write its dnode block too by
triggering sync_node_pages(ino) at this point, similar to the fsync
routine.

Thanks,

-- 
Jaegeuk Kim
Samsung

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[tip:x86/urgent] x86: Export 'boot_cpu_physical_apicid' to modules

2013-11-14 Thread tip-bot for David Rientjes

Commit-ID:  cc08e04c3fdcaab767b8db27527002b2b4d758cb
Gitweb: http://git.kernel.org/tip/cc08e04c3fdcaab767b8db27527002b2b4d758cb
Author: David Rientjes 
AuthorDate: Thu, 14 Nov 2013 15:05:32 -0800
Committer:  Ingo Molnar 
CommitDate: Fri, 15 Nov 2013 08:38:30 +0100

x86: Export 'boot_cpu_physical_apicid' to modules

Commit 9ebddac7ea2a "ACPI, x86: Fix extended error log driver to depend on
CONFIG_X86_LOCAL_APIC" fixed a build error when CONFIG_X86_LOCAL_APIC was not
selected and !CONFIG_SMP.

However, since CONFIG_ACPI_EXTLOG is tristate, there is a second build error:

  ERROR: "boot_cpu_physical_apicid" [drivers/acpi/acpi_extlog.ko] undefined!

The symbol needs to be exported for it to be available.

Signed-off-by: David Rientjes 
Acked-by: Tony Luck 
Cc: Chen Gong 
Cc: Rafael J. Wysocki 
Link: 
http://lkml.kernel.org/r/alpine.deb.2.02.1311141504080.30...@chino.kir.corp.google.com
[ Changed it to a _GPL() export. ]
Signed-off-by: Ingo Molnar 
---
 arch/x86/kernel/apic/apic.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index a7eb82d..befe498 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -62,6 +62,7 @@ unsigned disabled_cpus;
 
 /* Processor that is doing the boot up */
 unsigned int boot_cpu_physical_apicid = -1U;
+EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid);
 
 /*
  * The highest APIC ID seen during enumeration.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCHv5 06/20] hwmon: lm75: expose to thermal fw via DT nodes

2013-11-14 Thread Jean Delvare

Hi Eduardo,

Sorry for joining the discussion a little late, I could never find the
time to look into this patch series so far.

On Tue, 12 Nov 2013 15:46:08 -0400, Eduardo Valentin wrote:
> This patch adds to lm75 temperature sensor the possibility
> to expose itself as thermal zone device, registered on the
> thermal framework.
> 
> The thermal zone is built only if a device tree node
> describing a thermal zone for this sensor is present
> inside the lm75 DT node. Otherwise, the driver behavior
> will be the same.
> 
> Cc: Jean Delvare 
> Cc: lm-sens...@lm-sensors.org
> Cc: linux-kernel@vger.kernel.org
> Acked-by: Guenter Roeck 
> Signed-off-by: Eduardo Valentin 
> ---
>  drivers/hwmon/lm75.c | 35 ++-
>  1 file changed, 30 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/hwmon/lm75.c b/drivers/hwmon/lm75.c
> index c03b490..1d3600a 100644
> --- a/drivers/hwmon/lm75.c
> +++ b/drivers/hwmon/lm75.c
> @@ -27,6 +27,8 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#include 
>  #include "lm75.h"
>  
>  
> @@ -70,6 +72,7 @@ static const u8 LM75_REG_TEMP[3] = {
>  /* Each client has this additional data */
>  struct lm75_data {
>   struct device   *hwmon_dev;
> + struct thermal_zone_device  *tz;
>   struct mutexupdate_lock;
>   u8  orig_conf;
>   u8  resolution; /* In bits, between 9 and 12 */
> @@ -90,22 +93,36 @@ static struct lm75_data *lm75_update_device(struct device 
> *dev);
>  
>  /*---*/
>  
> +static inline long lm75_reg_to_mc(s16 temp, u8 resolution)
> +{
> + return ((temp >> (16 - resolution)) * 1000) >> (resolution - 8);
> +}
> +
>  /* sysfs attributes for hwmon */
>  
> +static int lm75_read_temp(void *dev, long *temp)
> +{
> + struct lm75_data *data = lm75_update_device(dev);
> +
> + if (IS_ERR(data))
> + return PTR_ERR(data);
> +
> + *temp = lm75_reg_to_mc(data->temp[0], data->resolution);
> +
> + return 0;
> +}
> +
>  static ssize_t show_temp(struct device *dev, struct device_attribute *da,
>char *buf)
>  {
>   struct sensor_device_attribute *attr = to_sensor_dev_attr(da);
>   struct lm75_data *data = lm75_update_device(dev);
> - long temp;
>  
>   if (IS_ERR(data))
>   return PTR_ERR(data);
>  
> - temp = ((data->temp[attr->index] >> (16 - data->resolution)) * 1000)
> ->> (data->resolution - 8);
> -
> - return sprintf(buf, "%ld\n", temp);
> + return sprintf(buf, "%ld\n", lm75_reg_to_mc(data->temp[attr->index],
> + data->resolution));
>  }

I am a bit worried by this. I did expect that you'd have to split a
piece of show_temp() into one separate function, not two. Here you have
quite some redundancy between lm75_read_temp and show_temp. Sure these
are small functions so the duplicate code is limited, but if you
multiply by the number of hwmon drivers which are candidates for this
change, this all adds up.

>  
>  static ssize_t set_temp(struct device *dev, struct device_attribute *da,
> @@ -271,6 +288,13 @@ lm75_probe(struct i2c_client *client, const struct 
> i2c_device_id *id)
>   goto exit_remove;
>   }
>  
> + data->tz = thermal_zone_of_sensor_register(>dev,
> +0,
> +>dev,
> +lm75_read_temp, NULL);

I am also worried by this interface. Basically a separate callback
function (here lm75_read_temp) is needed for every thermal input. Some
hwmon drivers have many of these! This will result in duplicate code
again. If you could just pass a sensor ID as an extra parameter to
lm75_read_temp, you could use the same callback for all sensors. This
would maybe also let you refactor the code above, as I believe
show_temp would then be able to call lm75_read_temp(dev, &temp, 0)
instead of reimplementing most of it.

I also note the lack of support for limits. Thermal zones can have
limits, and the various hwmon drivers do support that, yet your
interface offers no possibility to expose them. Wouldn't that be useful?

> + if (IS_ERR(data->tz))
> + data->tz = NULL;

If you are doing that in all drivers, I would question the point of
having thermal_zone_of_sensor_register() return a PTR_ERR in the first
place.

> +
>   dev_info(>dev, "%s: sensor '%s'\n",
>dev_name(data->hwmon_dev), client->name);
>  
> @@ -285,6 +309,7 @@ static int lm75_remove(struct i2c_client *client)
>  {
>   struct lm75_data *data = i2c_get_clientdata(client);
>  
> + thermal_zone_of_sensor_unregister(>dev, data->tz);
>   hwmon_device_unregister(data->hwmon_dev);
>   sysfs_remove_group(>dev.kobj, _group);
>   lm75_write_value(client, LM75_REG_CONF, data->orig_conf);

--

RE: [PATCH v6 3/3] dma: Add Freescale eDMA engine driver support

2013-11-14 Thread Jingchang Lu



> -Original Message-
> From: Mark Rutland [mailto:mark.rutl...@arm.com]
> Sent: Thursday, November 14, 2013 6:46 PM
> To: Lu Jingchang-B35083
> Cc: vinod.k...@intel.com; devicet...@vger.kernel.org; Wang Huan-B18965;
> linux-kernel@vger.kernel.org; shawn@linaro.org; linux-arm-
> ker...@lists.infradead.org
> Subject: Re: [PATCH v6 3/3] dma: Add Freescale eDMA engine driver support
> 
> On Wed, Sep 18, 2013 at 10:57:59AM +0100, Jingchang Lu wrote:
> > Add Freescale enhanced direct memory(eDMA) controller support.
> > The eDMA controller deploys DMAMUXs routing DMA request sources(slot)
> > to eDMA channels.
> > This module can be found on Vybrid and LS-1 SoCs.
> >
> > Signed-off-by: Alison Wang 
> > Signed-off-by: Jingchang Lu 
[...] 
> > +* DMAMUX
> > +Required properties:
> 
> No compatible?
> 
> Where are DMAMUX nodes expected to live?
> 
> How do they relate to the eDMA controller in HW? Are they a
> subcomponent, or a logically separate unit that happens to be connected?
[Lu Jingchang-b35083] 
DMAMUX is a multiplexer between DMA controller channels and peripheral devices.
Each DMAMUX provides 16 independently selectable DMA channel routers, and each
channel router can be assigned to one of the possible peripheral DMA slots.
So it's not a standalone device; it's just required by the DMA controller to
connect the channels and slaves, so it is obtained through the DMA controller's
"fsl,dma-mux" property.
Thanks!
> 
> > +- reg : Should contain DMAMUX registers location and length
> > +- fsl,dmamux-id : DMAMUX ID. DMAMUX IDs are unique in each eDMA
> controller.
> > +  inside one eDMA controller, specific request source can only be
> routed by
> > +  one of its DMAMUXs.
> > +  However Specific request source may be routed to different eDMA
> controller,
> > +  thus all the DMAMUXs sharing a the same request sources have the
> same ID.
> > +- clocks : Phandle of the clock used by the DMAMUX
> > +- clock-names : The clock names
> 
> _which_ clock names do you expect? From the looks of the example, you
> expect "dmamux".
> 
> From the view of the DMAMUX, what is its clock input called? "dmamux"
> doesn't look like what I'd expect for a clock name, but if the
> documentation for the eDMA doesn't provide a name for it, "dmamux" is
> fine.
> 
> If you're not using clock-names, it's useless. You _must_ define the set
> of names you expect or it's unusable. If you do define a set of names,
> then you should request clocks by name in the driver.
> 
[Lu Jingchang-b35083]
The clock here comes from the platform bus clock; I will remove this property for
compatibility between SoCs. Thanks!
> > +

[...] 

> > +* DMA clients
> > +DMA client drivers that uses the DMA function must use the format
> described
> > +in the dma.txt file, using a three-cell specifier for each channel: a
> phandle
> > +plus two integer cells as described above.
> 
> Nit: the phandle isn't part of the specifier. The cells after the
> phandle are the specifier associated with the phandle.
> 
[Lu Jingchang-b35083] 
I will fix this, thanks!
> [...]
> 
> > +static bool fsl_edma_filter_fn(struct dma_chan *chan, void *mux_id)
> > +{
> > +   struct fsl_edma_chan *fsl_chan = to_fsl_edma_chan(chan);
> > +
> > +   if (fsl_chan->edmamux->mux_id != (u32)mux_id)
> > +   return false;
> > +
> > +   return true;
> 
> Why not:
> 
> return fsl_chan->edmamux->mux_id == (u32)mux_id;
> 
[Lu Jingchang-b35083] 
I will replace it, thanks!
> [...]
> 
> > +static int
> > +fsl_init_edmamux(struct platform_device *pdev, struct fsl_edma_engine
> *fsl_edma)
> > +{
> > +   struct device_node *np = pdev->dev.of_node;
> > +   struct fsl_edma_dmamux *fsl_edmamux;
> > +   struct device_node *phandle;
> 
> That's confusing, a node is not a phandle. Why not mux_np?
[Lu Jingchang-b35083] 
OK, I will eliminate the confusion, thanks!
> 
> > +   const void *prop;
> > +   struct resource res;
> > +   int len, n_muxes, chans_per_mux, ch_off;
> > +   int i, j;
> > +   int ret;
> > +
> > +   prop = of_get_property(np, "fsl,dma-mux", );
> > +   if (!prop) {
> > +   dev_err(>dev, "Can't get DMAMUX.\n");
> > +   return -EINVAL;
> > +   }
> > +
> > +   n_muxes = len / sizeof(u32);
> 
> It would be nicer if we had a variant of of_count_phandle_with_args that
> could handle a fixed count of 0 args for this sort of thing.
[Lu Jingchang-b35083] 
Yes, I will do it, thanks!
> 
> > +   chans_per_mux = fsl_edma->n_chans / n_muxes;
> > +   fsl_edmamux = devm_kzalloc(>dev,
> > +   sizeof(struct fsl_edma_dmamux) * n_muxes,
> GFP_KERNEL);
> > +   if (!fsl_edmamux)
> > +   return -ENOMEM;
> > +
> > +   fsl_edma->n_muxes = 0;
> > +   fsl_edma->edmamux = fsl_edmamux;
> > +   for (i = 0; i < n_muxes; i++) {
> > +   phandle = of_parse_phandle(np, "fsl,dma-mux", i);
> > +   ret = of_address_to_resource(phandle, 0, );
> > +

[tip:perf/urgent] perf record: Add an option to force per-cpu mmaps

2013-11-14 Thread tip-bot for Adrian Hunter

Commit-ID:  539e6bb71e350541105e67e3d6c31392d9da25ef
Gitweb: http://git.kernel.org/tip/539e6bb71e350541105e67e3d6c31392d9da25ef
Author: Adrian Hunter 
AuthorDate: Fri, 1 Nov 2013 15:51:34 +0200
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Thu, 14 Nov 2013 16:10:27 -0300

perf record: Add an option to force per-cpu mmaps

By default, when tasks are specified (i.e. -p, -t or -u options)
per-thread mmaps are created.

Add an option to override that and force per-cpu mmaps.

Further comments by peterz:

So this option allows -t/-p/-u to create one buffer per cpu and attach
all the various thread/process/user tasks' their counters to that one
buffer?

As opposed to the current state where each such counter would have its
own buffer.

Signed-off-by: Adrian Hunter 
Tested-by: Sukadev Bhattiprolu 
Acked-by: Peter Zijlstra 
Cc: David Ahern 
Cc: Frederic Weisbecker 
Cc: Ingo Molnar 
Cc: Jiri Olsa 
Cc: Mike Galbraith 
Cc: Namhyung Kim 
Cc: Paul Mackerras 
Cc: Peter Zijlstra 
Cc: Stephane Eranian 
Link: 
http://lkml.kernel.org/r/1383313899-15987-7-git-send-email-adrian.hun...@intel.com
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/Documentation/perf-record.txt | 6 ++
 tools/perf/builtin-record.c  | 2 ++
 tools/perf/util/evlist.c | 4 +++-
 tools/perf/util/evsel.c  | 4 ++--
 tools/perf/util/target.h | 1 +
 5 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/tools/perf/Documentation/perf-record.txt 
b/tools/perf/Documentation/perf-record.txt
index 052f7c4..43b42c4 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -201,6 +201,12 @@ abort events and some memory events in precise mode on 
modern Intel CPUs.
 --transaction::
 Record transaction flags for transaction related events.
 
+--force-per-cpu::
+Force the use of per-cpu mmaps.  By default, when tasks are specified (i.e. -p,
+-t or -u options) per-thread mmaps are created.  This option overrides that and
+forces per-cpu mmaps.  A side-effect of that is that inheritance is
+automatically enabled.  Add the -i option also to disable inheritance.
+
 SEE ALSO
 
 linkperf:perf-stat[1], linkperf:perf-list[1]
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 4d644fe..7c8020a 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -888,6 +888,8 @@ const struct option record_options[] = {
"sample by weight (on special events only)"),
OPT_BOOLEAN(0, "transaction", _transaction,
"sample transaction flags (special events only)"),
+   OPT_BOOLEAN(0, "force-per-cpu", _per_cpu,
+   "force the use of per-cpu mmaps"),
OPT_END()
 };
 
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 5ce2ace..bbc746a 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -819,7 +819,9 @@ int perf_evlist__create_maps(struct perf_evlist *evlist, 
struct target *target)
if (evlist->threads == NULL)
return -1;
 
-   if (target__has_task(target))
+   if (target->force_per_cpu)
+   evlist->cpus = cpu_map__new(target->cpu_list);
+   else if (target__has_task(target))
evlist->cpus = cpu_map__dummy_new();
else if (!target__has_cpu(target) && !target->uses_mmap)
evlist->cpus = cpu_map__dummy_new();
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 18f7c18..46dd4c2 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -645,7 +645,7 @@ void perf_evsel__config(struct perf_evsel *evsel,
}
}
 
-   if (target__has_cpu(>target))
+   if (target__has_cpu(>target) || opts->target.force_per_cpu)
perf_evsel__set_sample_bit(evsel, CPU);
 
if (opts->period)
@@ -653,7 +653,7 @@ void perf_evsel__config(struct perf_evsel *evsel,
 
if (!perf_missing_features.sample_id_all &&
(opts->sample_time || !opts->no_inherit ||
-target__has_cpu(>target)))
+target__has_cpu(>target) || opts->target.force_per_cpu))
perf_evsel__set_sample_bit(evsel, TIME);
 
if (opts->raw_samples) {
diff --git a/tools/perf/util/target.h b/tools/perf/util/target.h
index 89bab71..2d0c506 100644
--- a/tools/perf/util/target.h
+++ b/tools/perf/util/target.h
@@ -12,6 +12,7 @@ struct target {
uid_tuid;
bool system_wide;
bool uses_mmap;
+   bool force_per_cpu;
 };
 
 enum target_errno {
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v3] epoll: drop EPOLLWAKEUP if PM_SLEEP is disabled

2013-11-14 Thread Amit Pundir

Drop EPOLLWAKEUP from epoll events mask if CONFIG_PM_SLEEP is disabled.

Signed-off-by: Amit Pundir 
---
Changes in v3:
 Renamed ep_epollwakeup_check() to ep_take_care_of_epollwakeup().
 Didn't update ep_create_wakeup_source() to return -ENOSYS if PM_SLEEP is unset.
---
 fs/eventpoll.c |3 +--
 include/uapi/linux/eventpoll.h |   13 -
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 473e09d..dbf382b 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1820,8 +1820,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
goto error_tgt_fput;
 
/* Check if EPOLLWAKEUP is allowed */
-   if ((epds.events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND))
-   epds.events &= ~EPOLLWAKEUP;
+   ep_take_care_of_epollwakeup();
 
/*
 * We have to check that the file structure underneath the file 
descriptor
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index 2c267bc..bc81fb2 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -61,5 +61,16 @@ struct epoll_event {
__u64 data;
 } EPOLL_PACKED;
 
-
+#ifdef CONFIG_PM_SLEEP
+static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev)
+{
+   if ((epev->events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND))
+   epev->events &= ~EPOLLWAKEUP;
+}
+#else
+static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev)
+{
+   epev->events &= ~EPOLLWAKEUP;
+}
+#endif
 #endif /* _UAPI_LINUX_EVENTPOLL_H */
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[tip:perf/urgent] perf ui browser: Fix segfault caused by off by one handling END key

2013-11-14 Thread tip-bot for Arnaldo Carvalho de Melo

Commit-ID:  48d038fcd09fa231e254965c3b69f8f640c9e62d
Gitweb: http://git.kernel.org/tip/48d038fcd09fa231e254965c3b69f8f640c9e62d
Author: Arnaldo Carvalho de Melo 
AuthorDate: Thu, 14 Nov 2013 15:30:41 -0300
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Thu, 14 Nov 2013 16:00:31 -0300

perf ui browser: Fix segfault caused by off by one handling END key

$ perf record ls
$ perf report

Press 'down enter end'

Result:

Program received signal SIGSEGV, Segmentation fault.

The UI browser, when used on an argv array, would access past the end of the
array on SEEK_END because it wasn't using 'nr_entries - 1'. Fix it.

Reported-by: v.kar...@samsung.com
Cc: Adrian Hunter 
Cc: David Ahern 
Cc: Frederic Weisbecker 
Cc: Jiri Olsa 
Cc: Mike Galbraith 
Cc: Namhyung Kim 
Cc: Paul Mackerras 
Cc: Peter Zijlstra 
Cc: Stephane Eranian 
BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=59291
Link: http://lkml.kernel.org/n/tip-3g83ipasqi219ktv764xz...@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/ui/browser.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/ui/browser.c b/tools/perf/ui/browser.c
index 3648d4e..cbaa7af 100644
--- a/tools/perf/ui/browser.c
+++ b/tools/perf/ui/browser.c
@@ -569,7 +569,7 @@ void ui_browser__argv_seek(struct ui_browser *browser, 
off_t offset, int whence)
browser->top = browser->top + browser->top_idx + offset;
break;
case SEEK_END:
-   browser->top = browser->top + browser->nr_entries + offset;
+   browser->top = browser->top + browser->nr_entries - 1 + offset;
break;
default:
return;
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[tip:sched/urgent] MAINTAINERS: Update file patterns in the lockdep and scheduler entries

2013-11-14 Thread tip-bot for Joe Perches

Commit-ID:  7486d6da09d4d76d49c68826c97578246936092b
Gitweb: http://git.kernel.org/tip/7486d6da09d4d76d49c68826c97578246936092b
Author: Joe Perches 
AuthorDate: Thu, 14 Nov 2013 14:59:45 -0800
Committer:  Ingo Molnar 
CommitDate: Fri, 15 Nov 2013 07:15:26 +0100

MAINTAINERS: Update file patterns in the lockdep and scheduler entries

Propagate the file movement effects of the following commits:

  7a6354e241d8 sched: Move wait.c into kernel/sched/
  8eddac3f1037 locking: Move the lockdep code to kernel/locking/

Signed-off-by: Joe Perches 
Cc: Peter Zijlstra 
Link: http://lkml.kernel.org/r/1384469985.2897.32.camel@joe-AO722
Signed-off-by: Ingo Molnar 
---
 MAINTAINERS | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index f3ef1d1..3f85561 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5236,7 +5236,7 @@ S:Maintained
 F: Documentation/lockdep*.txt
 F: Documentation/lockstat.txt
 F: include/linux/lockdep.h
-F: kernel/lockdep*
+F: kernel/locking/
 
 LOGICAL DISK MANAGER SUPPORT (LDM, Windows 2000/XP/Vista Dynamic Disks)
 M: "Richard Russon (FlatCap)" 
@@ -7361,7 +7361,6 @@ S:Maintained
 F: kernel/sched/
 F: include/linux/sched.h
 F: include/uapi/linux/sched.h
-F: kernel/wait.c
 F: include/linux/wait.h
 
 SCORE ARCHITECTURE
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[tip:perf/urgent] perf symbols: Limit max callchain using max_stack on DWARF unwinding too

2013-11-14 Thread tip-bot for Arnaldo Carvalho de Melo

Commit-ID:  37676af15c8d5a9689c9d1220d2a27d510cbe238
Gitweb: http://git.kernel.org/tip/37676af15c8d5a9689c9d1220d2a27d510cbe238
Author: Arnaldo Carvalho de Melo 
AuthorDate: Wed, 13 Nov 2013 17:40:36 -0300
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Thu, 14 Nov 2013 16:00:23 -0300

perf symbols: Limit max callchain using max_stack on DWARF unwinding too

It was affecting only frame-pointer (fp) based callchain processing.

Usage example:

  perf top --call-graph dwarf,1024 --max-stack 2

Works for any tool that does callchain resolving and provides a
--max-stack option.

Cc: Adrian Hunter 
Cc: David Ahern 
Cc: Frederic Weisbecker 
Cc: Jiri Olsa 
Cc: Mike Galbraith 
Cc: Namhyung Kim 
Cc: Paul Mackerras 
Cc: Peter Zijlstra 
Cc: Stephane Eranian 
Cc: Waiman Long 
Link: http://lkml.kernel.org/n/tip-eu45v8s3tq9ruay8tpfyo...@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/util/machine.c | 2 +-
 tools/perf/util/unwind.c  | 9 +
 tools/perf/util/unwind.h  | 5 +++--
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 0393912..84cdb07 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -1368,7 +1368,7 @@ int machine__resolve_callchain(struct machine *machine,
 
return unwind__get_entries(unwind_entry, &callchain_cursor, machine,
   thread, evsel->attr.sample_regs_user,
-  sample);
+  sample, max_stack);
 
 }
 
diff --git a/tools/perf/util/unwind.c b/tools/perf/util/unwind.c
index 5390d0b..0efd539 100644
--- a/tools/perf/util/unwind.c
+++ b/tools/perf/util/unwind.c
@@ -559,7 +559,7 @@ static unw_accessors_t accessors = {
 };
 
 static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb,
-  void *arg)
+  void *arg, int max_stack)
 {
unw_addr_space_t addr_space;
unw_cursor_t c;
@@ -575,7 +575,7 @@ static int get_entries(struct unwind_info *ui, 
unwind_entry_cb_t cb,
if (ret)
display_error(ret);
 
-   while (!ret && (unw_step(&c) > 0)) {
+   while (!ret && (unw_step(&c) > 0) && max_stack--) {
unw_word_t ip;
 
unw_get_reg(&c, UNW_REG_IP, &ip);
@@ -588,7 +588,8 @@ static int get_entries(struct unwind_info *ui, 
unwind_entry_cb_t cb,
 
 int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
struct machine *machine, struct thread *thread,
-   u64 sample_uregs, struct perf_sample *data)
+   u64 sample_uregs, struct perf_sample *data,
+   int max_stack)
 {
unw_word_t ip;
struct unwind_info ui = {
@@ -610,5 +611,5 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
if (ret)
return -ENOMEM;
 
-   return get_entries(&ui, cb, arg);
+   return get_entries(&ui, cb, arg, max_stack);
 }
diff --git a/tools/perf/util/unwind.h b/tools/perf/util/unwind.h
index ec0c71a..d5966f49 100644
--- a/tools/perf/util/unwind.h
+++ b/tools/perf/util/unwind.h
@@ -18,7 +18,7 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
struct machine *machine,
struct thread *thread,
u64 sample_uregs,
-   struct perf_sample *data);
+   struct perf_sample *data, int max_stack);
 int unwind__arch_reg_id(int regnum);
 #else
 static inline int
@@ -27,7 +27,8 @@ unwind__get_entries(unwind_entry_cb_t cb __maybe_unused,
struct machine *machine __maybe_unused,
struct thread *thread __maybe_unused,
u64 sample_uregs __maybe_unused,
-   struct perf_sample *data __maybe_unused)
+   struct perf_sample *data __maybe_unused,
+   int max_stack __maybe_unused)
 {
return 0;
 }
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[tip:perf/urgent] perf top: Add missing newline if the 'uid' is invalid

2013-11-14 Thread tip-bot for Ingo Molnar

Commit-ID:  ea432a8bb940e6bea2aaeca3c0ff3d931ad81f2e
Gitweb: http://git.kernel.org/tip/ea432a8bb940e6bea2aaeca3c0ff3d931ad81f2e
Author: Ingo Molnar 
AuthorDate: Wed, 13 Nov 2013 00:26:09 +0100
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Thu, 14 Nov 2013 15:59:50 -0300

perf top: Add missing newline if the 'uid' is invalid

Add missing newline if the 'uid' is invalid:

  hubble:~> perf top --stdio -u help
  Error:
  Invalid User: helphubble:~>

Fixed by this patch:

  comet:~/tip/tools/perf> perf top --stdio -u help
  Error:
  Invalid User: help
  comet:~/tip/tools/perf>

Signed-off-by: Ingo Molnar 
Cc: Frederic Weisbecker 
Cc: Adrian Hunter 
Cc: David Ahern 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Link: http://lkml.kernel.org/r/20131112232609.ga31...@gmail.com
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/builtin-top.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index b8f8e29..71e6402 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1172,7 +1172,7 @@ int cmd_top(int argc, const char **argv, const char 
*prefix __maybe_unused)
status = target__validate(target);
if (status) {
target__strerror(target, status, errbuf, BUFSIZ);
-   ui__warning("%s", errbuf);
+   ui__warning("%s\n", errbuf);
}
 
status = target__parse_uid(target);
@@ -1180,7 +1180,7 @@ int cmd_top(int argc, const char **argv, const char 
*prefix __maybe_unused)
int saved_errno = errno;
 
target__strerror(target, status, errbuf, BUFSIZ);
-   ui__error("%s", errbuf);
+   ui__error("%s\n", errbuf);
 
status = -saved_errno;
goto out_delete_evlist;
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[tip:perf/urgent] perf tools: Use perf_evlist__{first,last}, perf_evsel__next

2013-11-14 Thread tip-bot for Arnaldo Carvalho de Melo

Commit-ID:  9a354cdc2f40344a177d369fb4987a8270dd94df
Gitweb: http://git.kernel.org/tip/9a354cdc2f40344a177d369fb4987a8270dd94df
Author: Arnaldo Carvalho de Melo 
AuthorDate: Wed, 13 Nov 2013 15:54:30 -0300
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Thu, 14 Nov 2013 16:00:10 -0300

perf tools: Use perf_evlist__{first,last}, perf_evsel__next

In a few remaining places where the equivalent open coded variant was
still being used.

Cc: Adrian Hunter 
Cc: David Ahern 
Cc: Frederic Weisbecker 
Cc: Jiri Olsa 
Cc: Mike Galbraith 
Cc: Namhyung Kim 
Cc: Paul Mackerras 
Cc: Peter Zijlstra 
Cc: Stephane Eranian 
Link: http://lkml.kernel.org/n/tip-4vjnloi5fisilykwxalb5...@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/tests/parse-events.c | 3 +--
 tools/perf/ui/browsers/hists.c  | 9 -
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c
index ef671cd..3cbd104 100644
--- a/tools/perf/tests/parse-events.c
+++ b/tools/perf/tests/parse-events.c
@@ -441,9 +441,8 @@ static int test__checkevent_pmu_name(struct perf_evlist 
*evlist)
 
 static int test__checkevent_pmu_events(struct perf_evlist *evlist)
 {
-   struct perf_evsel *evsel;
+   struct perf_evsel *evsel = perf_evlist__first(evlist);
 
-   evsel = list_entry(evlist->entries.next, struct perf_evsel, node);
TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->nr_entries);
TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->attr.type);
TEST_ASSERT_VAL("wrong exclude_user",
diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c
index 16848bb..089fd37 100644
--- a/tools/perf/ui/browsers/hists.c
+++ b/tools/perf/ui/browsers/hists.c
@@ -1847,13 +1847,13 @@ browse_hists:
switch (key) {
case K_TAB:
if (pos->node.next == &evlist->entries)
-   pos = list_entry(evlist->entries.next, 
struct perf_evsel, node);
+   pos = perf_evlist__first(evlist);
else
-   pos = list_entry(pos->node.next, struct 
perf_evsel, node);
+   pos = perf_evsel__next(pos);
goto browse_hists;
case K_UNTAB:
if (pos->node.prev == &evlist->entries)
-   pos = list_entry(evlist->entries.prev, 
struct perf_evsel, node);
+   pos = perf_evlist__last(evlist);
else
pos = list_entry(pos->node.prev, struct 
perf_evsel, node);
goto browse_hists;
@@ -1943,8 +1943,7 @@ int perf_evlist__tui_browse_hists(struct perf_evlist 
*evlist, const char *help,
 
 single_entry:
if (nr_entries == 1) {
-   struct perf_evsel *first = list_entry(evlist->entries.next,
- struct perf_evsel, node);
+   struct perf_evsel *first = perf_evlist__first(evlist);
const char *ev_name = perf_evsel__name(first);
 
return perf_evsel__hists_browse(first, nr_entries, help,
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[tip:perf/urgent] perf probe: Add '--demangle'/'--no-demangle'

2013-11-14 Thread tip-bot for Azat Khuzhin

Commit-ID:  35e17b2450e09968f9702d4048c228199af171bc
Gitweb: http://git.kernel.org/tip/35e17b2450e09968f9702d4048c228199af171bc
Author: Azat Khuzhin 
AuthorDate: Mon, 28 Oct 2013 12:04:24 +0400
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Thu, 14 Nov 2013 16:06:28 -0300

perf probe: Add '--demangle'/'--no-demangle'

You can't pass demangled name into "perf probe", because of special chars:
./perf probe -f -x /tmp/a.out 'foo(int)'
Semantic error :There is non-digit char in line number.

And you can't even pass without demangling (because it search symbol in
DSO with demangle=true):
./perf probe -f -x /tmp/a.out _Z3fooi
no symbols found in /tmp/a.out, maybe install a debug package?

However:
nm /tmp/a.out | grep foo
0040056d T _Z3fooi

After this patch, using the next command:
./perf probe -f --no-demangle -x /tmp/a.out _Z3fooi

probe will be successfully added.

Signed-off-by: Azat Khuzhin 
Cc: Ingo Molnar 
Cc: Paul Mackerras 
Cc: Peter Zijlstra 
Link: 
http://lkml.kernel.org/r/1382947464-31266-1-git-send-email-a3at.m...@gmail.com
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/builtin-probe.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index 89acc17..6ea9e85 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -325,6 +325,8 @@ int cmd_probe(int argc, const char **argv, const char 
*prefix __maybe_unused)
 opt_set_filter),
OPT_CALLBACK('x', "exec", NULL, "executable|path",
"target executable name or path", opt_set_target),
+   OPT_BOOLEAN(0, "demangle", &symbol_conf.demangle,
+   "Disable symbol demangling"),
OPT_END()
};
int ret;
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[tip:perf/urgent] perf evsel: Introduce perf_evsel__prev() method

2013-11-14 Thread tip-bot for Arnaldo Carvalho de Melo

Commit-ID:  d87fcb4a2d990ba2de9284ede84a816c5066d54b
Gitweb: http://git.kernel.org/tip/d87fcb4a2d990ba2de9284ede84a816c5066d54b
Author: Arnaldo Carvalho de Melo 
AuthorDate: Wed, 13 Nov 2013 15:56:40 -0300
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Thu, 14 Nov 2013 16:00:16 -0300

perf evsel: Introduce perf_evsel__prev() method

Just one use so far, on the hists browser, for completeness since there
we use perf_evlist__{first,last} and perf_evsel__next() for handling the
TAB and UNTAB keys.

Cc: Adrian Hunter 
Cc: David Ahern 
Cc: Frederic Weisbecker 
Cc: Jiri Olsa 
Cc: Mike Galbraith 
Cc: Namhyung Kim 
Cc: Paul Mackerras 
Cc: Peter Zijlstra 
Cc: Stephane Eranian 
Link: http://lkml.kernel.org/n/tip-d09l4lejp5427enuf3igp...@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/ui/browsers/hists.c | 2 +-
 tools/perf/util/evsel.h| 5 +
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c
index 089fd37..a440e03 100644
--- a/tools/perf/ui/browsers/hists.c
+++ b/tools/perf/ui/browsers/hists.c
@@ -1855,7 +1855,7 @@ browse_hists:
if (pos->node.prev == &evlist->entries)
pos = perf_evlist__last(evlist);
else
-   pos = list_entry(pos->node.prev, struct 
perf_evsel, node);
+   pos = perf_evsel__prev(pos);
goto browse_hists;
case K_ESC:
if (!ui_browser__dialog_yesno(&browser->b,
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index f502965..1ea7c92 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -279,6 +279,11 @@ static inline struct perf_evsel *perf_evsel__next(struct 
perf_evsel *evsel)
return list_entry(evsel->node.next, struct perf_evsel, node);
 }
 
+static inline struct perf_evsel *perf_evsel__prev(struct perf_evsel *evsel)
+{
+   return list_entry(evsel->node.prev, struct perf_evsel, node);
+}
+
 /**
  * perf_evsel__is_group_leader - Return whether given evsel is a leader event
  *
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[tip:perf/urgent] perf tools: Synthesize anon MMAP records again

2013-11-14 Thread tip-bot for Don Zickus

Commit-ID:  9d4ecc8893832337daf241236841db966fa53489
Gitweb: http://git.kernel.org/tip/9d4ecc8893832337daf241236841db966fa53489
Author: Don Zickus 
AuthorDate: Wed, 13 Nov 2013 15:32:06 -0300
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Thu, 14 Nov 2013 16:00:01 -0300

perf tools: Synthesize anon MMAP records again

When introducing the PERF_RECORD_MMAP2 in:

5c5e854bc760 perf tools: Add attr->mmap2 support

A check for the number of entries parsed by sscanf was introduced that
assumed all of the 8 fields needed to be correctly parsed so that
particular /proc/pid/maps line would be considered synthesizable.

That broke anon records synthesizing, as it doesn't have the 'execname'
field.

Fix it by keeping the sscanf return check, changing it to not require
that the 'execname' variable be parsed, so that the preexisting logic
can kick in and set it to '//anon'.

This should get things like JIT profiling working again.

Signed-off-by: Don Zickus 
Cc: Bill Gray 
Cc: Jiri Olsa 
Cc: Joe Mario 
Cc: Richard Fowles 
Cc: Stephane Eranian 
Cc: sta...@vger.kernel.org
Link: http://lkml.kernel.org/n/tip-bo4akalno7579shpz29u8...@git.kernel.org
[ commit log message is mine, dzickus reported the problem with a patch ]
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/util/event.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 6e3a846..bb788c1 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -209,8 +209,10 @@ static int perf_event__synthesize_mmap_events(struct 
perf_tool *tool,
   >mmap.start, >mmap.len, prot,
   >mmap.pgoff,
   execname);
-
-   if (n != 5)
+   /*
+* Anon maps don't have the execname.
+*/
+   if (n < 4)
continue;
/*
 * Just like the kernel, see __perf_event_mmap in 
kernel/perf_event.c
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[tip:perf/urgent] perf tools: Remove trivial extra semincolon

2013-11-14 Thread tip-bot for Davidlohr Bueso

Commit-ID:  b13936ef7d48908be2fab7639dd535c88045
Gitweb: http://git.kernel.org/tip/b13936ef7d48908be2fab7639dd535c88045
Author: Davidlohr Bueso 
AuthorDate: Tue, 12 Nov 2013 22:24:24 -0800
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Thu, 14 Nov 2013 15:59:38 -0300

perf tools: Remove trivial extra semincolon

Accidentally ran into these, get rid of them.

Signed-off-by: Davidlohr Bueso 
Link: 
http://lkml.kernel.org/r/1384323864.2527.8.ca...@buesod1.americas.hpqcorp.net
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/ui/browser.c  | 2 +-
 tools/perf/util/evlist.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/perf/ui/browser.c b/tools/perf/ui/browser.c
index bbc782e..3648d4e 100644
--- a/tools/perf/ui/browser.c
+++ b/tools/perf/ui/browser.c
@@ -680,7 +680,7 @@ static void __ui_browser__line_arrow_down(struct ui_browser 
*browser,
if (end >= browser->top_idx + browser->height)
end_row = browser->height - 1;
else
-   end_row = end - browser->top_idx;;
+   end_row = end - browser->top_idx;
 
ui_browser__gotorc(browser, row, column);
SLsmg_draw_vline(end_row - row + 1);
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index dc6fa3f..5ce2ace 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -1148,7 +1148,7 @@ size_t perf_evlist__fprintf(struct perf_evlist *evlist, 
FILE *fp)
   perf_evsel__name(evsel));
}
 
-   return printed + fprintf(fp, "\n");;
+   return printed + fprintf(fp, "\n");
 }
 
 int perf_evlist__strerror_tp(struct perf_evlist *evlist __maybe_unused,
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[tip:perf/urgent] perf trace: Tweak summary output

2013-11-14 Thread tip-bot for Pekka Enberg

Commit-ID:  27a778b512e002d856952b4f01842ba4d34bc3d1
Gitweb: http://git.kernel.org/tip/27a778b512e002d856952b4f01842ba4d34bc3d1
Author: Pekka Enberg 
AuthorDate: Wed, 13 Nov 2013 14:21:48 +0200
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Thu, 14 Nov 2013 15:59:20 -0300

perf trace: Tweak summary output

Tweak the summary output as suggested by Ingo Molnar:

  [penberg@localhost ~]$ perf trace -a --duration 1 --summary -- sleep 1
  ^C
   Summary of events:

   Xorg (817), 148 events, 0.0%, 0.000 msec

 syscallcalls  min   avg   max  stddev
 (msec)(msec)(msec)(%)
 ---  - - - --
 read   7 0.002 0.004 0.011 32.00%
 rt_sigprocmask40 0.001 0.001 0.002  1.31%
 ioctl  6 0.002 0.003 0.005 19.45%
 writev 7 0.004 0.018 0.059 43.76%
 select 9 0.00074.513   507.869 74.61%
 setitimer  4 0.001 0.002 0.002 10.08%

Suggested-by: Ingo Molnar 
Signed-off-by: Pekka Enberg 
Acked-by: Ingo Molnar 
Cc: David Ahern 
Cc: Ingo Molnar 
Link: 
http://lkml.kernel.org/r/1384345308-24404-1-git-send-email-penb...@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/builtin-trace.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 6b230af..8be17fc 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -2112,9 +2112,9 @@ static size_t thread__dump_stats(struct thread_trace 
*ttrace,
 
printed += fprintf(fp, "\n");
 
-   printed += fprintf(fp, "
msec/call\n");
-   printed += fprintf(fp, "   syscallcalls  min  avg   
   max stddev\n");
-   printed += fprintf(fp, "   ---    
 --\n");
+   printed += fprintf(fp, "   syscallcalls  min   avg  
 max  stddev\n");
+   printed += fprintf(fp, "   (msec)(msec) 
   (msec)(%%)\n");
+   printed += fprintf(fp, "   ---  - - 
- --\n");
 
/* each int_node is a syscall */
while (inode) {
@@ -2131,9 +2131,9 @@ static size_t thread__dump_stats(struct thread_trace 
*ttrace,
 
sc = &trace->syscalls.table[inode->i];
printed += fprintf(fp, "   %-15s", sc->name);
-   printed += fprintf(fp, " %8" PRIu64 " %8.3f %8.3f",
+   printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f",
   n, min, avg);
-   printed += fprintf(fp, " %8.3f %6.2f\n", max, pct);
+   printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
}
 
inode = intlist__next(inode);
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v3 04/15] KVM: MMU: flush tlb out of mmu lock when write-protect the sptes

2013-11-14 Thread Xiao Guangrong

On 11/15/2013 02:39 AM, Marcelo Tosatti wrote:
> On Thu, Nov 14, 2013 at 01:15:24PM +0800, Xiao Guangrong wrote:
>>
>> Hi Marcelo,
>>
>> On 11/14/2013 08:36 AM, Marcelo Tosatti wrote:
>>
>>>
>>> Any code location which reads the writable bit in the spte and assumes if 
>>> its not
>>> set, that the translation which the spte refers to is not cached in a
>>> remote CPU's TLB can become buggy. (*)
>>>
>>> It might be the case that now its not an issue, but its so subtle that
>>> it should be improved.
>>>
>>> Can you add a fat comment on top of is_writeable_bit describing this?
>>> (and explain why is_writable_pte users do not make an assumption
>>> about (*). 
>>>
>>> "Writeable bit of locklessly modifiable sptes might be cleared
>>> but TLBs not flushed: so whenever reading locklessly modifiable sptes
>>> you cannot assume TLBs are flushed".
>>>
>>> For example this one is unclear:
>>>
>>> if (!can_unsync && is_writable_pte(*sptep))
>>> goto set_pte;
>>> And:
>>>
>>> if (!is_writable_pte(spte) &&
>>>   !(pt_protect && spte_is_locklessly_modifiable(spte)))
>>> return false;
>>>
>>> This is safe because get_dirty_log/kvm_mmu_slot_remove_write_access are
>>> serialized by a single mutex (if there were two mutexes, it would not be
>>> safe). Can you add an assert to both
>>> kvm_mmu_slot_remove_write_access/kvm_vm_ioctl_get_dirty_log 
>>> for (slots_lock) is locked, and explain?
>>>
>>> So just improve the comments please, thanks (no need to resend whole
>>> series).
>>
>> Thank you very much for your time to review it and really appreciate
>> for you detailed the issue so clearly to me.
>>
>> I will do it on the top of this patchset or after it is merged
>> (if it's possiable).
> 
> Ok, can you explain why every individual caller of is_writable_pte have
> no such assumption now? (the one mentioned above is not clear to me for
> example, should explain all of them).

Okay.

Generally speaking, we 1) needn't care much about read-only sptes since they
cannot be locklessly write-protected and 2) if is_writable_pte() is used
to check the mmu-mode's state we can check SPTE_MMU_WRITEABLE instead.

There are the places is_writable_pte is used:
1) in spte_has_volatile_bits():
 527 static bool spte_has_volatile_bits(u64 spte)
 528 {
 529 /*
 530  * Always atomicly update spte if it can be updated
 531  * out of mmu-lock, it can ensure dirty bit is not lost,
 532  * also, it can help us to get a stable is_writable_pte()
 533  * to ensure tlb flush is not missed.
 534  */
 535 if (spte_is_locklessly_modifiable(spte))
 536 return true;
 537
 538 if (!shadow_accessed_mask)
 539 return false;
 540
 541 if (!is_shadow_present_pte(spte))
 542 return false;
 543
 544 if ((spte & shadow_accessed_mask) &&
 545   (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
 546 return false;
 547
 548 return true;
 549 }

This path is not broken, since any spte that can be locklessly modified will do
a lockless update (the function always returns 'true' at line 536).

2): in mmu_spte_update()
594 /*
 595  * For the spte updated out of mmu-lock is safe, since
 596  * we always atomicly update it, see the comments in
 597  * spte_has_volatile_bits().
 598  */
 599 if (spte_is_locklessly_modifiable(old_spte) &&
 600   !is_writable_pte(new_spte))
 601 ret = true;

The new_spte is a temp value that can not be fetched by lockless
write-protection and !is_writable_pte() is stable enough (can not be
locklessly write-protected).

3) in spte_write_protect()
1368 if (!is_writable_pte(spte) &&
1369   !spte_is_locklessly_modifiable(spte))
1370 return false;
1371

It always does write-protection if the spte is locklessly modifiable.
(This is how the code looks after applying the whole patchset; the code is also
safe before patch "[PATCH v3 14/15] KVM: MMU: clean up spte_write_protect" since
the lockless write-protection path is serialized by a single lock.)

4) in set_spte()
2690 /*
2691  * Optimization: for pte sync, if spte was writable the 
hash
2692  * lookup is unnecessary (and expensive). Write protection
2693  * is responsibility of mmu_get_page / kvm_sync_page.
2694  * Same reasoning can be applied to dirty page accounting.
2695  */
2696 if (!can_unsync && is_writable_pte(*sptep))
2697 goto set_pte;

It is used for an optimization, and the worst case is that the optimization is
disabled (walking the shadow pages in the hash table) when the spte has been
locklessly write-protected. It does not hurt anything since it is a rare event.
And the optimization can be restored if we check SPTE_MMU_WRITEABLE

Re: [PATCH] perf kvm record: Change the default value of perf_guest to 0.

2013-11-14 Thread Dongsheng Yang


Hi David:

On 11/14/2013 03:38 PM, David Ahern wrote:

On 11/14/13, 2:51 PM, Dongsheng Yang wrote:
Currently, we can not record the event counters of host to 
perf.data.host.

Example:
perf kvm --host record -a sleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.625 MB perf.data.guest (~27290 
samples) ]


perf kvm by definition wants --guest enabled. Host profiling is done 
with just 'perf record'; no need for the kvm layer at all. If you 
really want to go through perf-kvm to record host events disable guest 
actions:


$ perf kvm --host --no-guest record -a sleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.029 MB perf.data.host (~1255 
samples) ]



David


Thanx for your reply ! It makes sense.


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Fwd: A problem about IO scheduler in kernel

2013-11-14 Thread 韩磊

-- Forwarded message --
From: 韩磊 
Date: 2013/11/15
Subject: A problem about IO scheduler in kernel
To: Linux Kernel Mailing List 


Recently I have been writing an IO scheduler for the kernel called
"Simple_Deadline".
"Simple_Deadline" is based on "deadline-iosched". The algorithm is very
simple. It has two lists, one for reads and one for writes. A request
enters a list based on its weight, which is calculated from the
request's size and whether it is a read or a write. When dispatching a
request, it just compares the weights of the read list and the write
list, and the one with the smaller weight is dispatched.

When I modprobe this module and run it, it works well if only a little
IO arrives. But when a large amount of IO is running, the system crashes.
Can you help me find the problem? I am so sad and helpless about it.

When system crashed,the screen display some information:

Message from syslogd@han at Nov 15 13:12:03 ...
 kernel:[ cut here ]

Message from syslogd@han at Nov 15 13:12:03 ...
 kernel:invalid opcode: 0000 [#1] SMP

Message from syslogd@han at Nov 15 13:12:03 ...
 kernel:last sysfs file:
/sys/devices/pci:00/:00:04.0/:05:00.0/host0/port-0:0/end_device-0:0/target0:0:0/0:0:0:0/block/sda/queue/scheduler

Message from syslogd@han at Nov 15 13:12:03 ...
 kernel:Stack:

Message from syslogd@han at Nov 15 13:12:03 ...
 kernel:Call Trace:

Message from syslogd@han at Nov 15 13:12:03 ...
 kernel:Code: 2a 48 89 fe 4c 89 e7 e8 35 ff 01 00 48 8b 83 80 00 00 00
83 e0 03 4c 09 e0 4c 8b 64 24 08 48 89 83 80 00 00 00 48 8b 1c 24 c9
c3 <0f> 0b eb fe 66 90 55 48 89 e5 0f 1f 44 00 00 45 31 c0 48 89 f9


The code in accessory!  Please help find the bugs!  Thank you!
/*
 *  Deadline i/o scheduler.
 *
 *  Copyright (C) 2002 Jens Axboe 
 */
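#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/rbtree.h>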

/*
 * Compile-time defaults for the scheduler.
 * The two *_expire values are multiples of HZ (timer ticks per second), so
 * they are half a second for reads and five seconds for writes.
 * NOTE(review): presumably these seed the fifo_expire[], fifo_batch and
 * writes_starved fields of struct deadline_data at init time -- the init
 * code is not visible here, confirm.
 */
static const int read_expire = HZ / 2; 
static const int write_expire = 5 * HZ; 
static const int writes_starved = 2;
static const int fifo_batch = 16;  




/* NOTE(review): no use of DELTA is visible in this excerpt -- confirm purpose. */
#define DELTA 1

/*
 * Per-direction cost constants.
 * NOTE(review): presumably inputs to req_list_count_weight(); units are not
 * stated anywhere in the visible code -- confirm.
 */
static const int read_time_per_byte = 30 ;  
static const int write_time_per_byte = 30  ; 


static const int read_time_once_transfer = 30 ; 
static const int write_time_once_transfer = 300 ; 

/*
 * Node of a weighted request list.
 *
 * The same type is used for the two list anchors embedded in
 * struct deadline_data (req_weight_list[2]) and for the per-request
 * entries allocated in deadline_add_request().  Nodes are chained through
 * the raw next/prev pointers below, not through struct list_head.
 */
struct req_list_head
{
	/* request this entry was created for */
	struct request *current_req;
	/* value of jiffies stored when the entry was created */
	unsigned long last_jiffies;
	/* weight returned by req_list_count_weight() for this request */
	long current_weight;
	/* neighbouring nodes; NOTE(review): circular vs. NULL-terminated is not visible here */
	struct req_list_head *next;
	struct req_list_head *prev;
};

/*
 * Per-queue scheduler state, reached through q->elevator->elevator_data.
 *
 * sort_list[] and next_rq[] are indexed by rq_data_dir(rq) (see
 * deadline_rb_root() and deadline_del_rq_rb()).
 * NOTE(review): the other two-element arrays are presumably indexed the
 * same way -- confirm against the code that fills them.
 */
struct deadline_data {

	/* anchors of the weighted request lists */
	struct req_list_head req_weight_list[2];
	/* cached request per direction; advanced in deadline_del_rq_rb() */
	struct request *next_rq[2];
	/* rb-trees holding the queued requests, one per direction */
	struct rb_root sort_list[2];	
	struct list_head fifo_list[2];
	unsigned int batching;		/* number of sequential requests made */  
	sector_t last_sector;		/* head position */
	unsigned int starved;		/* times reads have starved writes */
	/* tunables; NOTE(review): presumably seeded from the file-scope defaults */
	int fifo_expire[2]; 
	int fifo_batch;
	int writes_starved;
	int front_merges;

};
/*
 * Forward declarations.
 * The req_list_* / weight-list helpers are declared with external linkage;
 * their definitions are not part of this excerpt.
 * NOTE(review): if they are only used in this file they should probably be
 * static -- confirm once the definitions are visible.
 */
void req_list_remove_request(struct req_list_head *list_head);
struct request *select_req_from_weight_list(struct req_list_head *req_write_list,struct req_list_head *req_read_list);
int req_list_empty(struct req_list_head *list_head);
void req_list_add(struct req_list_head *new_req,struct req_list_head *list_head);
void update_req_list_weight(struct req_list_head *list_head);
long req_list_count_weight(int data_dir,unsigned int bio_size);  
void Init_req_list_head(struct req_list_head *req_weight_list);
/* needed by deadline_add_rq_rb(), which calls it before its definition */
static void deadline_move_request(struct deadline_data *, struct request *);

static inline struct rb_root *
deadline_rb_root(struct deadline_data *dd, struct request *rq)
{
	return >sort_list[rq_data_dir(rq)];
}

static inline struct request *
deadline_latter_request(struct request *rq)
{
	struct rb_node *node = rb_next(>rb_node);

	if (node)
		return rb_entry_rq(node);

	return NULL;
}

/*
 * Insert @rq into the rb-tree for its data direction.
 *
 * elv_rb_add() hands back a request already in the tree when the insert
 * collides with it; each such request is passed to deadline_move_request()
 * and the insert is retried until it succeeds.
 */
static void
deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
{
	struct rb_root *tree = deadline_rb_root(dd, rq);

	for (;;) {
		struct request *clash = elv_rb_add(tree, rq);

		if (!unlikely(clash))
			break;

		deadline_move_request(dd, clash);
	}
}

/*
 * Remove @rq from the rb-tree for its data direction.
 *
 * If @rq is the cached next request for that direction, the cache is first
 * advanced to the request following @rq (or NULL) so that it never points
 * at a request that has left the tree.
 */
static inline void
deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
{
	struct request **cached = &dd->next_rq[rq_data_dir(rq)];

	if (*cached == rq)
		*cached = deadline_latter_request(rq);

	elv_rb_del(deadline_rb_root(dd, rq), rq);
}

/*
 * add rq to rbtree and fifo
 */
static void   
deadline_add_request(struct request_queue *q, struct request *rq)
{
	printk("enter merge_add_request\n");
	struct deadline_data *dd = q->elevator->elevator_data;
	const int data_dir = rq_data_dir(rq);
	
	unsigned int bio_size=rq->__data_len;


	struct req_list_head *req_list=(struct req_list_head *)kmalloc(sizeof(*req_list),GFP_KERNEL);
	req_list->current_req=rq;
	req_list->current_weight=req_list_count_weight(data_dir,bio_size);
	req_list->last_jiffies=jiffies;

Re: [PATCH] PCI: Init NumVFs register to zero in sriov_init()

2013-11-14 Thread Yinghai Lu

On Wed, Nov 13, 2013 at 5:57 PM, Ethan Zhao  wrote:
> On Wed, Nov 6, 2013 at 10:49 PM, ethan.zhao  wrote:
>> Though there is no specification of the NumVFs register's initial value after
>> POST, to avoid the confusing
>> lspci output shown below before VF was enabled, we should clear the NumVFs 
>> value left by BIOS
>> to zero:

Does BIOS need to clear it?


>>
>> $lspci -vvv -s 03:00.0
>> Ethernet controller: Intel Corporation 82599EB 10-Gigabit SFI/SFP+ Network 
>> Connection (rev 01)
>> ~
>> Capabilities: [160 v1] Single Root I/O Virtualization (SR-IOV)
>> IOVCap: Migration-, Interrupt Message Number: 000
>> IOVCtl: Enable+ Migration- Interrupt- MSE+ ARIHierarchy+
>> IOVSta: Migration-
>> Initial VFs: 64, Total VFs: 64, Number of VFs: 64, Function 
>> Dependency Link: 00
>>   ^dazed !
>> ~

just display problem?

>> Signed-off-by: ethan.zhao 
>> ---
>>  drivers/pci/iov.c | 2 ++
>>  1 file changed, 2 insertions(+)
>>
>> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
>> index de8ffac..a4941ad 100644
>> --- a/drivers/pci/iov.c
>> +++ b/drivers/pci/iov.c
>> @@ -439,6 +439,8 @@ static int sriov_init(struct pci_dev *dev, int pos)
>>
>>  found:
>> pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, ctrl);
>> +   /* VF Enable is cleared, so we could init the NumVFs register to 0 */
>> +   pci_write_config_word(dev, pos + PCI_SRIOV_NUM_VF, 0);
>> pci_read_config_word(dev, pos + PCI_SRIOV_VF_OFFSET, &offset);
>> pci_read_config_word(dev, pos + PCI_SRIOV_VF_STRIDE, &stride);
>> if (!offset || (total > 1 && !stride))

Thanks

Yinghai
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 0/3] Early use of boot service memory

2013-11-14 Thread H. Peter Anvin

On 11/14/2013 10:55 PM, Yinghai Lu wrote:
> 
> Why just asking distros to append ",high" in their installation
> program for 64bit by default?
> 
[...]
> 
> What is hpa's suggestion?
> 

Pretty much what you just said ;)

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 0/3] Early use of boot service memory

2013-11-14 Thread Yinghai Lu

On Thu, Nov 14, 2013 at 10:24 PM, Ingo Molnar  wrote:
>
> * jerry.hoem...@hp.com  wrote:
>
>> On Thu, Nov 14, 2013 at 08:44:04PM +0200, Pekka Enberg wrote:
>> > On Thu, Nov 14, 2013 at 8:04 PM,   wrote:
>> > > Making this issue a quirk will be a lot more practical.  Its a small, 
>> > > focused
>> > > change whose implications are limited and more easily understood.
>> >
>> > There's nothing practical with requiring users to pass a kernel option
>> > to make kdump work.  It's a workaround, sure, but it's not a proper
>> > fix.
>>
>> One already has to specify command line arguments to enable kdump.
>> See "crashkernel=" in Documentation/kernel-parameters.txt.
>
>> As i said in an earlier mail we are working w/ distros. [...]

Why just asking distros to append ",high" in their installation
program for 64bit by default?

If they don't want to do that, you can add instruction in your product notes, to
ask user/admin to add that if kdump fails.

>
>
>> As i said in earlier mail, i am willing to change implementation to
>> some type of black/white listing.
>
> Is it possible to fix it the way hpa suggested?
>

What is hpa's suggestion?

Yinghai
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 1/2] ARM: dts: imx: specify the value of audmux pinctrl instead of 0x80000000

2013-11-14 Thread Nicolin Chen

On Fri, Nov 15, 2013 at 02:42:01PM +0800, Shawn Guo wrote:
> On Thu, Nov 14, 2013 at 07:07:09PM +0800, Nicolin Chen wrote:
> > We must specify the value of audmux pinctrl if we want to use pinctrl_pm().
> > Thus change bypass value 0x80000000 to what we exactly need.
> > 
> > This patch also seperately unset PUE bit for TXD so that IOMUX won't pull
> > up/down the pin after turning into tristate. When we use SSI normal mode to
> > playback monaural audio via I2S signal, there'd be a pulled curve occur to
> > its signal at the second slot if setting PUE bit for TXD. And it will make
> > the second channel to play a constant noise. So by keeping the signal level
> > in the second slot, we can get a constant high level signal (-1) or a low
> > level one (0).
> > 
> > Signed-off-by: Nicolin Chen 
> > ---
> >  arch/arm/boot/dts/imx6qdl.dtsi | 22 +++---
> >  1 file changed, 11 insertions(+), 11 deletions(-)
> 
> We have moved all pin groups settings into
> arch/arm/boot/dts/imx6qdl-pingrp.h.  I just rebased and applied the
> patch.  Please check my imx/dt branch and ensure I applied the changes
> correctly.

Simply perfect. Thank you.
Nicolin

---


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] ARC: extable: Enable sorting at build time

2013-11-14 Thread Vineet Gupta

Avoids wasting cycles at boot specially on slower simulators

Signed-off-by: Vineet Gupta 
Cc: David Daney 
Cc: Michal Marek 
Cc: Francois Bedard 
Cc: linux-kernel@vger.kernel.org
---
 arch/arc/Kconfig  | 1 +
 scripts/sortextable.c | 5 +
 2 files changed, 6 insertions(+)

diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index 91dbb2757afd..080580216301 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -8,6 +8,7 @@
 
 config ARC
def_bool y
+   select BUILDTIME_EXTABLE_SORT
select CLONE_BACKWARDS
# ARC Busybox based initramfs absolutely relies on DEVTMPFS for /dev
select DEVTMPFS if !INITRAMFS_SOURCE=""
diff --git a/scripts/sortextable.c b/scripts/sortextable.c
index 7c2310c5b996..e3fb1c36ed0f 100644
--- a/scripts/sortextable.c
+++ b/scripts/sortextable.c
@@ -31,6 +31,10 @@
 #include 
 #include 
 
+#ifndef EM_ARCOMPACT
+#define EM_ARCOMPACT   93
+#endif
+
 #ifndef EM_AARCH64
 #define EM_AARCH64 183
 #endif
@@ -244,6 +248,7 @@ do_file(char const *const fname)
case EM_S390:
custom_sort = sort_relative_table;
break;
+   case EM_ARCOMPACT:
case EM_ARM:
case EM_AARCH64:
case EM_MIPS:
-- 
1.8.1.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 1/2] isp1704_charger: Add DT support

2013-11-14 Thread Pali Rohár

On Thursday 14 November 2013 23:34:19 Sebastian Reichel wrote:
> On Thu, Nov 14, 2013 at 07:05:06PM +0100, Pali Rohár wrote:
> > On Thursday 14 November 2013 13:38:53 Sebastian Reichel 
wrote:
> > > diff --git a/drivers/power/isp1704_charger.c
> > > b/drivers/power/isp1704_charger.c index fc04d19..db96778
> > > 100644
> > > --- a/drivers/power/isp1704_charger.c
> > > +++ b/drivers/power/isp1704_charger.c
> > > @@ -28,6 +28,8 @@
> > > 
> > >  #include 
> > >  #include 
> > >  #include 
> > > 
> > > +#include 
> > > +#include 
> > > 
> > >  #include 
> > >  #include 
> > > 
> > > @@ -89,8 +91,8 @@ static void
> > > isp1704_charger_set_power(struct isp1704_charger *isp,
> > > bool on) {
> > > 
> > >   struct isp1704_charger_data *board =
> > > 
> > > isp->dev->platform_data;
> > > 
> > > - if (board && board->set_power)
> > > - board->set_power(on);
> > > + if (board)
> > > + gpio_set_value(board->enable_gpio, on);
> > > 
> > >  }
> > 
> > You need to check if enable_gpio in board data was defined
> > or not.
> 
> The device is not successfully probed without a valid enable_gpio.
> 
> -- Sebastian

Then, OK.

-- 
Pali Rohár
pali.ro...@gmail.com


signature.asc
Description: This is a digitally signed message part.

Re: [PATCH 1/2] ARM: dts: imx: specify the value of audmux pinctrl instead of 0x80000000

2013-11-14 Thread Shawn Guo

On Thu, Nov 14, 2013 at 07:07:09PM +0800, Nicolin Chen wrote:
> We must specify the value of audmux pinctrl if we want to use pinctrl_pm().
> Thus change bypass value 0x80000000 to what we exactly need.
> 
> This patch also separately unset PUE bit for TXD so that IOMUX won't pull
> up/down the pin after turning into tristate. When we use SSI normal mode to
> playback monaural audio via I2S signal, there'd be a pulled curve occur to
> its signal at the second slot if setting PUE bit for TXD. And it will make
> the second channel to play a constant noise. So by keeping the signal level
> in the second slot, we can get a constant high level signal (-1) or a low
> level one (0).
> 
> Signed-off-by: Nicolin Chen 
> ---
>  arch/arm/boot/dts/imx6qdl.dtsi | 22 +++---
>  1 file changed, 11 insertions(+), 11 deletions(-)

We have moved all pin groups settings into
arch/arm/boot/dts/imx6qdl-pingrp.h.  I just rebased and applied the
patch.  Please check my imx/dt branch and ensure I applied the changes
correctly.

Shawn

> 
> diff --git a/arch/arm/boot/dts/imx6qdl.dtsi b/arch/arm/boot/dts/imx6qdl.dtsi
> index 6e096ca..6b76e55 100644
> --- a/arch/arm/boot/dts/imx6qdl.dtsi
> +++ b/arch/arm/boot/dts/imx6qdl.dtsi
> @@ -601,27 +601,27 @@
>   audmux {
>   pinctrl_audmux_1: audmux-1 {
>   fsl,pins = <
> - 
> MX6QDL_PAD_SD2_DAT0__AUD4_RXD  0x8000
> - 
> MX6QDL_PAD_SD2_DAT3__AUD4_TXC  0x8000
> - 
> MX6QDL_PAD_SD2_DAT2__AUD4_TXD  0x8000
> - 
> MX6QDL_PAD_SD2_DAT1__AUD4_TXFS 0x8000
> + 
> MX6QDL_PAD_SD2_DAT0__AUD4_RXD  0x130b0
> + 
> MX6QDL_PAD_SD2_DAT3__AUD4_TXC  0x130b0
> + 
> MX6QDL_PAD_SD2_DAT2__AUD4_TXD  0x110b0
> + 
> MX6QDL_PAD_SD2_DAT1__AUD4_TXFS 0x130b0
>   >;
>   };
>  
>   pinctrl_audmux_2: audmux-2 {
>   fsl,pins = <
> - 
> MX6QDL_PAD_CSI0_DAT7__AUD3_RXD  0x8000
> - 
> MX6QDL_PAD_CSI0_DAT4__AUD3_TXC  0x8000
> - 
> MX6QDL_PAD_CSI0_DAT5__AUD3_TXD  0x8000
> - 
> MX6QDL_PAD_CSI0_DAT6__AUD3_TXFS 0x8000
> + 
> MX6QDL_PAD_CSI0_DAT7__AUD3_RXD  0x130b0
> + 
> MX6QDL_PAD_CSI0_DAT4__AUD3_TXC  0x130b0
> + 
> MX6QDL_PAD_CSI0_DAT5__AUD3_TXD  0x110b0
> + 
> MX6QDL_PAD_CSI0_DAT6__AUD3_TXFS 0x130b0
>   >;
>   };
>  
>   pinctrl_audmux_3: audmux-3 {
>   fsl,pins = <
> - 
> MX6QDL_PAD_DISP0_DAT16__AUD5_TXC  0x8000
> - 
> MX6QDL_PAD_DISP0_DAT18__AUD5_TXFS 0x8000
> - 
> MX6QDL_PAD_DISP0_DAT19__AUD5_RXD  0x8000
> + 
> MX6QDL_PAD_DISP0_DAT16__AUD5_TXC  0x130b0
> + 
> MX6QDL_PAD_DISP0_DAT18__AUD5_TXFS 0x130b0
> + 
> MX6QDL_PAD_DISP0_DAT19__AUD5_RXD  0x130b0
>   >;
>   };
>   };
> -- 
> 1.8.4
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [GIT PULL 00/10] perf/core improvements and fixes

2013-11-14 Thread Ingo Molnar


* Arnaldo Carvalho de Melo  wrote:

> From: Arnaldo Carvalho de Melo 
> 
> Hi Ingo,
> 
>   Please consider pulling, done on top of tip/perf/urgent.
> 
> - Arnaldo
> 
> The following changes since commit e310718d0e83aeb9969264dc577c45db16d9104d:
> 
>   tools/perf/build: Fix feature-libunwind-debug-frame handling (2013-11-14 
> 18:00:45 +0100)
> 
> are available in the git repository at:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux 
> tags/perf-core-for-mingo
> 
> for you to fetch changes up to 539e6bb71e350541105e67e3d6c31392d9da25ef:
> 
>   perf record: Add an option to force per-cpu mmaps (2013-11-14 16:10:27 
> -0300)
> 
> 
> perf/core improvements and fixes:
> 
> . Synthesize anon MMAP records again, fix from Don Zickus.
> 
> . Add an option in 'perf record' to force per-cpu mmaps, from Adrian Hunter.
> 
> . Limit max callchain using max_stack on DWARF unwinding too.
> 
> . Fix segfault in the UI browser caused by off by one handling END key.
> 
> . Add '--demangle'/'--no-demangle' to perf probe, so that we can overcome
>   current limitations in handling C++ symbols, from Azat Khuzhin .
> 
> . Tweak 'perf trace' summary output, from Pekka Enberg.
> 
> Signed-off-by: Arnaldo Carvalho de Melo 
> 
> 
> Adrian Hunter (1):
>   perf record: Add an option to force per-cpu mmaps
> 
> Arnaldo Carvalho de Melo (4):
>   perf tools: Use perf_evlist__{first,last}, perf_evsel__next
>   perf evsel: Introduce perf_evsel__prev() method
>   perf symbols: Limit max callchain using max_stack on DWARF unwinding too
>   perf ui browser: Fix segfault caused by off by one handling END key
> 
> Azat Khuzhin (1):
>   perf probe: Add '--demangle'/'--no-demangle'
> 
> Davidlohr Bueso (1):
>   perf tools: Remove trivial extra semincolon
> 
> Don Zickus (1):
>   perf tools: Synthesize anon MMAP records again
> 
> Ingo Molnar (1):
>   perf top: Add missing newline if the 'uid' is invalid
> 
> Pekka Enberg (1):
>   perf trace: Tweak summary output
> 
>  tools/perf/Documentation/perf-record.txt |  6 ++
>  tools/perf/builtin-probe.c   |  2 ++
>  tools/perf/builtin-record.c  |  2 ++
>  tools/perf/builtin-top.c |  4 ++--
>  tools/perf/builtin-trace.c   | 10 +-
>  tools/perf/tests/parse-events.c  |  3 +--
>  tools/perf/ui/browser.c  |  4 ++--
>  tools/perf/ui/browsers/hists.c   | 11 +--
>  tools/perf/util/event.c  |  6 --
>  tools/perf/util/evlist.c |  6 --
>  tools/perf/util/evsel.c  |  4 ++--
>  tools/perf/util/evsel.h  |  5 +
>  tools/perf/util/machine.c|  2 +-
>  tools/perf/util/target.h |  1 +
>  tools/perf/util/unwind.c |  9 +
>  tools/perf/util/unwind.h |  5 +++--
>  16 files changed, 50 insertions(+), 30 deletions(-)

Pulled, thanks a lot Arnaldo!

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [BUG] perf stat: explicit grouping yields unexpected results

2013-11-14 Thread Ingo Molnar


* Stephane Eranian  wrote:

> Jiri,
> 
> I was trying the grouping support in perf stat and I was surprised 
> to see that if I create a group that is too big to be scheduled, and 
> where only N out of P events can fit, perf stat still yields counts 
> for the N events. I was expecting 0 counts or <not counted>.
> 
> The kernel semantic is to schedule all the events in a group or 
> none. Perf does something different and this is confusing. If you 
> use explicit grouping then I think you want to group to fail if not 
> all the events can be scheduled:
> 
> On an IvyBridge:
> $ perf stat --g -e
> '{cycles,instructions,branches,branches,branches,branches,branches}'
> noploop 1
>  3 229 417 079 cycles
>  3 223 919 023 instructions  #1,00  insns per cycle
>  3 220 868 098 branches
>  3 220 868 098 branches
>  3 220 868 098 branches
>  3 220 868 098 branches
>      <not counted> branches
> 
> I think it should be: <not counted> for all events.

Btw., does the kernel side currently support discovery of such 
impossible group scheduling constraints at group setup time? If not 
then it probably should and it should reject them straight away.

Thanks,

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

DEAL FOR DINOSILE OIL

2013-11-14 Thread Dr. Bright M Ben

Greetings,
 

 I am contacting you regarding our company urgent need for (DINOSILE OIL) a raw 
material I found in India recently.
 
It is the major liquid material our company has been using for the production 
of animal injections and vaccines and also for research since 2001. Our company 
is currently buying this product at the rate of $12,000USD per 1 gallon 5liters 
from a supplier in USA and Brazil.
 
Recently I found out that this same material is sold in India at the rate USD 
$4,000 per 1 gallon 5liters and I do not wish to let anyone in our company know 
about this because of my interest in the business. I intend to present you as a 
supplier of the material in India {you will be a Middleman between our company 
and the local vendor in India} so that my company will not know the main-source 
of the material.
 
This is just a kind of buying and selling. If you are willing to co-operate 
with me, I will send you my director's contact details then you will send him 
an offer and as soon as he indicate interest to buy, you will let him know the 
stock is ready and your are willing to supply at a reduced price per gallon. I 
will also forward you the contact details of the local dealer over there in 
India.
 
 
God Bless You
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: 3.10.16 cgroup_mutex deadlock

2013-11-14 Thread Tejun Heo

Hello,

On Thu, Nov 14, 2013 at 04:56:49PM -0600, Shawn Bohrer wrote:
> After running both concurrently on 40 machines for about 12 hours I've
> managed to reproduce the issue at least once, possibly more.  One
> machine looked identical to this reported issue.  It has a bunch of
> stuck cgroup_free_fn() kworker threads and one thread in cpuset_attach
> waiting on lru_add_drain_all().  A sysrq+l shows all CPUs are idle
> except for the one triggering the sysrq+l.  The sysrq+w unfortunately
> wrapped dmesg so we didn't get the stacks of all blocked tasks.  We
> did however also cat /proc/<pid>/stack of all kworker threads on the
> system.  There were 265 kworker threads that all have the following
> stack:

Umm... so, WQ_DFL_ACTIVE is 256.  It's just an arbitrarily largish
number which is supposed to serve as protection against runaway
kworker creation.  The assumption there is that there won't be a
dependency chain which can be longer than that and if there are it
should be separated out into a separate workqueue.  It looks like we
*can* have such long chain of dependency with high enough rate of
cgroup destruction.  kworkers trying to destroy cgroups get blocked by
an earlier one which is holding cgroup_mutex.  If the blocked ones
completely consume max_active and then the earlier one tries to
perform an operation which makes use of the system_wq, the forward
progress guarantee gets broken.

So, yeah, it makes sense now.  We're just gonna have to separate out
cgroup destruction to a separate workqueue.  Hugh's temp fix achieved
about the same effect by putting the affected part of destruction to a
different workqueue.  I probably should have realized that we were
hitting max_active when I was told that moving some part to a
different workqueue makes the problem go away.

Will send out a patch soon.

Thanks.

-- 
tejun
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 0/3] Early use of boot service memory

2013-11-14 Thread Ingo Molnar


* jerry.hoem...@hp.com  wrote:

> On Thu, Nov 14, 2013 at 08:44:04PM +0200, Pekka Enberg wrote:
> > On Thu, Nov 14, 2013 at 8:04 PM,   wrote:
> > > Making this issue a quirk will be a lot more practical.  It's a small, 
> > > focused
> > > change whose implications are limited and more easily understood.
> > 
> > There's nothing practical with requiring users to pass a kernel option
> > to make kdump work.  It's a workaround, sure, but it's not a proper
> > fix.
> 
> One already has to specify command line arguments to enable kdump. 
> See "crashkernel=" in Documentation/kernel-parameters.txt.

That option is already a usability barrier. Adding yet another 
usability barrier improves things how?

> As i said in an earlier mail we are working w/ distros. [...]

The point being?

> [...]  distros can and do specify lots of interesting command line 
> arguments for their systems.  Distros have tools for configuring 
> kdump. User must already use these tools or manually edit multiple 
> config files, to get kdump to work.  I would work with distros to 
> help integrate this change into their tools.

Here you describe a method that has already successfully cut the kdump 
user base to a fraction of its potential size. Why should we assist to 
that effort of engineered obscurity?

> As i said in earlier mail, i am willing to change implementation to 
> some type of black/white listing.

Is it possible to fix it the way hpa suggested?

Thanks,

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v2] of: make of_get_phy_mode parse 'phy-connection-type'

2013-11-14 Thread Florian Fainelli

Per the ePAPR v1.1 specification, 'phy-connection-type' is the canonical
property name for describing an Ethernet to PHY connection type. Make
sure that of_get_phy_mode() also attempts to parse that property and
update the comments mentioning 'phy-mode' to also include
'phy-connection-type'.

Acked-by: Grant Likely 
Signed-off-by: Florian Fainelli 
---
Changes since v1:
- reworked the error condition to look nicer per Grant's suggestion
- added Grant's Acked-by tag
- fixed a typo in the commit message on "mentioning"

 drivers/of/of_net.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/of/of_net.c b/drivers/of/of_net.c
index 8f9be2e..651e249 100644
--- a/drivers/of/of_net.c
+++ b/drivers/of/of_net.c
@@ -13,8 +13,8 @@
 
 /**
  * It maps 'enum phy_interface_t' found in include/linux/phy.h
- * into the device tree binding of 'phy-mode', so that Ethernet
- * device driver can get phy interface from device tree.
+ * into the device tree binding of 'phy-mode' or 'phy-connection-type',
+ * so that Ethernet device driver can get phy interface from device tree.
  */
 static const char *phy_modes[] = {
[PHY_INTERFACE_MODE_NA] = "",
@@ -36,8 +36,9 @@ static const char *phy_modes[] = {
  * of_get_phy_mode - Get phy mode for given device_node
  * @np:Pointer to the given device_node
  *
- * The function gets phy interface string from property 'phy-mode',
- * and return its index in phy_modes table, or errno in error case.
+ * The function gets phy interface string from property 'phy-mode' or
+ * 'phy-connection-type', and return its index in phy_modes table, or errno in
+ * error case.
  */
 int of_get_phy_mode(struct device_node *np)
 {
@@ -46,6 +47,8 @@ int of_get_phy_mode(struct device_node *np)
 
err = of_property_read_string(np, "phy-mode", &pm);
if (err < 0)
+   err = of_property_read_string(np, "phy-connection-type", &pm);
+   if (err < 0)
return err;
 
for (i = 0; i < ARRAY_SIZE(phy_modes); i++)
-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [REVIEW][PATCH 1/2] userns: Better restrictions on when proc and sysfs can be mounted

2013-11-14 Thread Gao feng

On 11/15/2013 12:54 PM, Eric W. Biederman wrote:
> Gao feng  writes:
> 
>> On 11/15/2013 12:54 AM, Andy Lutomirski wrote:
>>> On Thu, Nov 14, 2013 at 3:10 AM, Gao feng  wrote:
 On 11/13/2013 03:26 PM, Gao feng wrote:
> On 11/09/2013 01:42 PM, Eric W. Biederman wrote:
>> Right now I would rather not have the empty directory exception than
>> remove this code.
>>
>> The test is a little trickier to write than it might otherwise be
>> because /proc and /sys tend to be slightly imperfect filesystems.
>>
>> I think the only way to really test that is to call readdir on the
>> directory itself :(  I don't like that thought.
>>
>> I don't know what I was thinking when I wrote that test but I definitely
>> goofed up.  Grr!
>>
>> I can certainly filter out any directory with nlink > 2.  That would be
>> an easy partial step forward.
>>
>> The real question though is how do I detect directories it is safe to
>> mount on where there will not be files in them.  I can't call iterate
>> with the namespace_lock held so things are a bit tricky.
>>
>
> I know this problem is not easy to be resolved. why not let the user
> make the decision?  maybe we can introduce a new mount option MS_LOCK,
> if user wants to use mount to hide something, he should use mount with
> option MS_LOCK. so the unpriviged user can't umount this filesystem and
> fail to mount the filesystem if one of it's child mount is mounted with
> MS_LOCK option otherwise he use MS_REC too.
>

 Something like this.

 From 437f33ea366623c7a9d557b2e84cae424876a44f Mon Sep 17 00:00:00 2001
 From: Gao feng 
 Date: Wed, 13 Nov 2013 16:06:46 +0800
 Subject: [PATCH] userns: introduce new mount option MS_LOCK

 After commit 5ff9d8a65ce80efb509ce4e8051394e9ed2cd942
 vfs: Lock in place mounts from more privileged users,
 in userns, the mounts of child mntns which copied from
 parent mntns is locked and user has no rights to umount/move
 them, it's too strict.

 The core purpose of above commit is trying to prevent
 unprivileged user from accessing files hidden by mount.
 This patch introduces a new mount option MS_LOCK, this
 gives user the capable to mount filesystem as the type
 of lock if he wants to use mount to hide something.

>>>
>>> This is bad -- if something was secure in old kernels, it needs to
>>> stay secure.  If you had MS_NOT_A_LOCK, that would be okay, but it
>>> might not solve your problem.
>>>
>>
>> what you mean old kernels here? I saw patch "vfs: Lock in place mounts from 
>> more privileged users"
>> is merged into upstream in linux 3.12-rc1, this is not very old. I think 
>> there
>> are not many userspace processes rely on this feature.
> 
> Sort of true.  Most people aren't that silly.  This feature was added to
> defend against a theoretical attack that you can use with mount
> namespaces.
> 
> In particular the scenario we are concerned with is:
> 
> Suppose the file system looks like:
> 
> Suppose there are two filesystems a and b that look like:
> 
> a:/usr/
> a:/usr/my_very_secret_file
> a:/dev/
> a:/etc/
> a:/lib/
> 
> b:/bin/
> b:/etc/
> b:/games/
> b:/include/
> b:/lib/
> b:/lib32/
> b:/local/
> b:/sbin/
> b:/share/
> b:/src/
> 
> And filesystem b is mounted on a:/usr hiding a:/usr/my_very_secret_file
> 
> So the filesystem looks like:
> 
> /usr/
> /usr/bin/
> /usr/etc/
> /usr/games/
> /usr/include/
> /usr/lib/
> /usr/lib32/
> /usr/local/
> /usr/sbin/
> /usr/share/
> /usr/src/
> /dev/
> /etc/
> /lib/
> 
> Without locking mounts into place an unprivileged user can clone the
> mount namespace and do "umount /usr" and read /usr/my_very_secret_file.
> 
> Most systems don't hide sensitive things with mounts but it is very
> possible and guarding against is fairly cheap and easy.  And while a
> little annoying it should not be a large impediment to unprivileged user
> of the user namespace because pivot root still works.
> 
> This thread started talking about bugs in fs_fully_visible.  And those
> bugs are fixable and I aim to get to them shortly.  At the very least
> I can lie and test for nlink <= 2 which fixes the regression in mounting
> proc.
> 
> Then I can write the fun version that takes references and drops locks
> so it can call the internal version of readdir to see if a directory is
> actually empty.
> 
> But the principle remains the same we really don't want to reveal
> anything that is hidden under a mount on purpose or by mistake.  Just
> because then we don't have to think about those things from a security
> point of view making everyone's life easier.
> 

OK, I agree with you that we should make containers secure by default.

What do you think about introducing an MS_NOT_A_LOCK option, as Andy
suggested?

In libvirt, host creates dev and devpts directories for container,then
mount devpts, tmpfs on them and create device nodes

Re: [PATCH 10/10] perf record: Add an option to force per-cpu mmaps

2013-11-14 Thread Ingo Molnar

* Arnaldo Carvalho de Melo  wrote:

> +--force-per-cpu::
> + Force the use of per-cpu mmaps.  By default, when tasks are specified (i.e. 
> -p,
> + -t or -u options) per-thread mmaps are created.  This option overrides that 
> and
> + forces per-cpu mmaps.  A side-effect of that is that inheritance is
> + automatically enabled.  Add the -i option also to disable inheritance.

So I still haven't seen an explanation why it's called 'force' 
anything. AFAICS nothing is 'forced' really, this is simply another 
trace-ringbuffer setup method, right?

And I also raised why this shouldn't be the default event tracing 
method instead of a weird config option. Per-cpu tracing is cache 
compact, it is easier to size properly and in general it is pretty 
easy to think about. (It also has less of the TSC timestamp ordering 
problems as per thread tracing, at least in theory.)

Is there something that makes per cpu tracing undesirable as the 
default?

Thanks,

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] Cpufreq: Remove fossil comment in the cpufreq_governor_dbs()

2013-11-14 Thread Lan Tianyu

The related code has been changed and the comment is out of date.
So remove it.

Signed-off-by: Lan Tianyu 
---
 drivers/cpufreq/cpufreq_governor.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/drivers/cpufreq/cpufreq_governor.c 
b/drivers/cpufreq/cpufreq_governor.c
index 0806c31..e6be635 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -328,10 +328,6 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy,
 dbs_data->cdata->gov_dbs_timer);
}
 
-   /*
-* conservative does not implement micro like ondemand
-* governor, thus we are bound to jiffes/HZ
-*/
if (dbs_data->cdata->governor == GOV_CONSERVATIVE) {
cs_dbs_info->down_skip = 0;
cs_dbs_info->enable = 1;
-- 
1.8.4.rc0.1.g8f6a3e5.dirty

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 10/10] perf tests: Use lower sample_freq in sw clock event period test

2013-11-14 Thread Namhyung Kim

Hi Arnaldo,

On Tue, 12 Nov 2013 11:41:01 -0300, Arnaldo Carvalho de Melo wrote:
> Em Tue, Nov 12, 2013 at 09:07:36AM +0200, Adrian Hunter escreveu:
>> -#define NR_LOOPS  100
>> +#define NR_LOOPS  1000
>
> Lower frequency, need to generate more noise, ugh. Adding that, but I
> think this test needs to be reworked, Namhyung?

Hmm.. We might go back to use the default frequency of 4000 (or 1000)
and make the loop based on time like using alarm or setitimer.

Or we can add an outer loop which doubles the inner loop counter if no
samples are collected.

What do you think?

Thanks,
Namhyung
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 1/1] Cpufreq: Make governor data on nonboot cpus across system suspend/resume

2013-11-14 Thread Lan Tianyu

Currently, the governor of the nonboot cpus is put to EXIT on system suspend,
since all these cpus are unplugged and the governor usage_count decreases
to zero. The governor data and its sysfs interfaces are then freed or released.
This causes the user config of these governors to be lost across suspend and resume.

This doesn't happen on the governor covering boot cpu because it isn't
unplugged during system suspend.

To fix this issue, skip the governor exit during system suspend and check the
policy governor data to determine whether the governor really needs to be
initialized when doing init. If not, return EALREADY to indicate that the
governor has already been initialized and nothing should be done.
__cpufreq_governor() converts EALREADY to 0 as the return value for the INIT
event, since the governor is still in the INIT state and can do the START operation.

Signed-off-by: Lan Tianyu 
---
 drivers/cpufreq/cpufreq.c  |  5 -
 drivers/cpufreq/cpufreq_governor.c | 13 -
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 02d534d..38f2e4a 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1239,7 +1239,7 @@ static int __cpufreq_remove_dev_finish(struct device *dev,
 
/* If cpu is last user of policy, free policy */
if (cpus == 1) {
-   if (has_target()) {
+   if (has_target() && !frozen) {
ret = __cpufreq_governor(policy,
CPUFREQ_GOV_POLICY_EXIT);
if (ret) {
@@ -1822,6 +1822,9 @@ static int __cpufreq_governor(struct cpufreq_policy 
*policy,
((event == CPUFREQ_GOV_POLICY_EXIT) && !ret))
module_put(policy->governor->owner);
 
+   if ((event == CPUFREQ_GOV_POLICY_INIT) && ret == -EALREADY)
+   ret = 0;
+
return ret;
 }
 
diff --git a/drivers/cpufreq/cpufreq_governor.c 
b/drivers/cpufreq/cpufreq_governor.c
index 0806c31..ddb93af 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -204,9 +204,20 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy,
 
switch (event) {
case CPUFREQ_GOV_POLICY_INIT:
+   /*
+* In order to keep governor data across suspend/resume,
+* Governor doesn't exit when suspend and will be
+* reinitialized when resume. Here check policy governor
+* data to determine whether the governor has been exited.
+* If not, return EALREADY.
+*/
if (have_governor_per_policy()) {
-   WARN_ON(dbs_data);
+   if (dbs_data)
+   return -EALREADY;
} else if (dbs_data) {
+   if (policy->governor_data == dbs_data)
+   return -EALREADY;
+
dbs_data->usage_count++;
policy->governor_data = dbs_data;
return 0;
-- 
1.8.2.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] [RFC] initramfs: Prefix simple paths with $(srctree)

2013-11-14 Thread Vineet Gupta

On 11/15/2013 03:44 AM, Geert Uytterhoeven wrote:
> If CONFIG_INITRAMFS_SOURCE contains relative paths inside the source tree
> (e.g. in a defconfig pointing to arch-specific files), the corresponding
> file system entries are not found when building outside the source tree.
> 
> Prefix all simple paths (paths not starting with "/", "../", or "./") with
> $(srctree) to fix this.
> 
> Signed-off-by: Geert Uytterhoeven 
> ---
> This issue happens when building an OpenRISC defconfig from
> git://openrisc.net/jonas/linux. Mainline doesn't have the OpenRISC
> initramfs.

+1.
This applies to ARC defconfig too.

> 
> Questions:
>   1. Is this an acceptable solution for mainline?
>   2. My make-foo is limited. is there a better way to accomplish this, than
>  by prefixing all paths and removing the prefixes again where they're
>  not wanted?
>
>  usr/Makefile |6 ++
>  1 file changed, 6 insertions(+)
> 
> diff --git a/usr/Makefile b/usr/Makefile
> index e767f019accf..2170c38936ce 100644
> --- a/usr/Makefile
> +++ b/usr/Makefile
> @@ -41,6 +41,12 @@ hostprogs-y := gen_init_cpio
>  initramfs   := $(CONFIG_SHELL) $(srctree)/scripts/gen_initramfs_list.sh
>  ramfs-input := $(if $(filter-out "",$(CONFIG_INITRAMFS_SOURCE)), \
>   $(shell echo $(CONFIG_INITRAMFS_SOURCE)),-d)
> +ifneq ("$(ramfs-input)", "-d")
> +ramfs-input := $(patsubst %, $(srctree)/%, $(ramfs-input))
> +ramfs-input := $(patsubst $(srctree)//%, /%, $(ramfs-input))
> +ramfs-input := $(patsubst $(srctree)/../%, ../%, $(ramfs-input))
> +ramfs-input := $(patsubst $(srctree)/./%, ./%, $(ramfs-input))
> +endif
>  ramfs-args  := \
>  $(if $(CONFIG_INITRAMFS_ROOT_UID), -u $(CONFIG_INITRAMFS_ROOT_UID)) \
>  $(if $(CONFIG_INITRAMFS_ROOT_GID), -g $(CONFIG_INITRAMFS_ROOT_GID))
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

ARC cross compiler hosting on kernel.org

2013-11-14 Thread Vineet Gupta

Hi Tony,

Can you please host a ARC cross compiler on
https://www.kernel.org/pub/tools/crosstool

Mainline buildroot supports ARC (alas that build system doesn't generate a
relocatable toolchain).

Or you can simply use our homegrown build-system to checkout and build tools
(which can even do a relocatable toolchain provided you have patchelf installed)

git clone g...@github.com:foss-for-synopsys-dwc-arc-processors/toolchain.git
cd toolchain
./build-all.sh --auto-checkout --auto-pull --rel-rpaths --no-elf32 \
--target-cflags "-gdwarf-2"

ARC defconfig requires an INITRAMFS at a relative path. You can either disable 
it
(I have a patch queued up for a !RAMFS defconfig too).

Let me know if you run into any issues.

Thx,
-Vineet
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 05/40] staging/lustre: validate open handle cookies

2013-11-14 Thread Greg Kroah-Hartman

On Fri, Nov 15, 2013 at 12:13:07AM +0800, Peng Tao wrote:
> From: "John L. Hammond" 
> 
> Add a const void *h_owner member to struct portals_handle. Add a const
> void *owner parameter to class_handle2object() which must be matched
> by the h_owner member of the handle in addition to the cookie.

Ick ick ick.

NEVER use a void pointer if you can help it, and for a "handle", never.
This isn't other operating systems, sorry.  We know what types our
pointers to structures are, use them, so that the compiler can catch our
problems, and don't try to cheat by using void *.

> Adjust
> the callers of class_handle2object() accordingly, using NULL as the
> argument to the owner parameter, except in the case of
> mdt_handle2mfd() where we add an explicit mdt_export_data parameter
> which we use as the owner when searching for a MFD. When allocating a
> new MFD, pass a pointer to the mdt_export_data into mdt_mfd_new() and
> store it in h_owner. This allows the MDT to validate that the client
> has not sent the wrong open handle cookie, or sent the right cookie to
> the wrong MDT.

This changelog entry doesn't even match up with the code below.  All
callers of class_handle2object are passing NULL here, which makes this
patch pretty pointless, right?

And that's a _very_ generic global symbol name, please don't do that, it
needs to be "lustre_*" at the front to even expect it to be acceptable.

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 00/40] staging/lustre: patch bomb 1

2013-11-14 Thread Greg Kroah-Hartman

On Fri, Nov 15, 2013 at 12:13:02AM +0800, Peng Tao wrote:
> [sadly git-send-email died sending the patchset... so resending]
> 
> Hi Greg,
> 
> Here are 40 patches. Three cleanup patches and 37 patches ported
> from Lustre master to commit
> LU-2940 llite: Fix for oops in vvp_pgcache_show()
> 
> There are two more patchsets that I will send out following up.
> 
> Thanks,
> Tao
> 
> Cc: Andreas Dilger 
> 
> Amir Shehata (2):
>   staging/lustre/lnet: Fix assert on empty group in selftest module
>   staging/lustre/ptlrpc: Fix a crash when dereferencing NULL pointer
> 
> Andreas Dilger (2):
>   staging/lustre/ldlm: fix resource/fid check, use DLDLMRES
>   staging/lustre/idl: remove LASSERT/CLASSERT from lustre_idl.h
> 
> Andrew Perepechko (1):
>   staging/lustre/llite: extended attribute cache
> 
> Andriy Skulysh (2):
>   staging/lustre/ptlrpc: Fix race during exp_flock_hash creation
>   staging/lustre/ptlrpc: flock deadlock detection does not work
> 
> Artem Blagodarenko (1):
>   staging/lustre/mgs: set_param -P option that sets value permanently
> 
> Dmitry Eremin (2):
>   staging/lustre/build: clean up unused variables and dead code
>   staging/lustre/build: fix compilation issue with is_compat_task
> 
> Doug Oucharek (1):
>   staging/lustre/lnet: Add LNet Router Priority parameter
> 
> Fan Yong (2):
>   staging/lustre/scrub: OI scrub on OST
>   staging/lustre/scrub: control OI scrub on OST from user space
> 
> JC Lafoucriere (5):
>   staging/lustre/llite: Access to released file trigs a restore
>   staging/lustre/mdt: HSM coordinator client interface
>   staging/lustre/mdt: HSM coordinator agent interface
>   staging/lustre/api: HSM import uses new released pattern
>   staging/lustre/utils: HSM Posix CopyTool
> 
> James Simmons (2):
>   staging/lustre/autoconf: remove vectored fops tests
>   staging/lustre/autoconf: remove LIBCFS_HAVE_IS_COMPAT_TASK test
> 
> Jinshan Xiong (2):
>   staging/lustre/hsm: Implementation of exclusive open
>   staging/lustre/hsm: Add hsm_release feature.
> 
> John L. Hammond (7):
>   staging/lustre: validate open handle cookies
>   staging/lustre/llite: use correct FID in ll_och_fill()
>   staging/lustre/lov: convert magic to host-endian in lov_dump_lmm()
>   staging/lustre/mdc: prevent fall through in mdc_iocontrol()
>   staging/lustre/lu: shrink lu_object by 8 bytes on x86_64
>   staging/lustre/llite: don't check for O_CREAT in it_create_mode
>   staging/lustre/llite: pass correct pointer to obd_iocontrol()
> 
> Lai Siyao (1):
>   staging/lustre/llite: remove ll_d_root_ops
> 
> Li Xi (1):
>   staging/lustre/llog: fix return value of llog_alloc_handle
> 
> Mikhail Pershin (4):
>   staging/lustre/server: use unified request handler for MGS
>   staging/lustre/llog: MGC to use OSD API for backup logs
>   staging/lustre/target: move OUT to the unified target code
>   staging/lustre/seq: unified SEQ handler
> 
> Nathaniel Clark (1):
>   staging/lustre/xattr: separate ACL and XATTR caches
> 
> Patrick Farrell (1):
>   staging/lustre/nfs: writing to new files will return ENOENT
> 
> Peng Tao (3):
>   staging/lustre/llite: restore ll_fiemap
>   staging/lustre: remove lu_target.h
>   staging/lustre: remove llog_server.c
> 
>  .../staging/lustre/include/linux/libcfs/curproc.h  |1 -
>  .../staging/lustre/include/linux/libcfs/libcfs.h   |2 -
>  .../lustre/include/linux/libcfs/libcfs_ioctl.h |1 +
>  .../staging/lustre/include/linux/lnet/lib-lnet.h   |5 +-
>  .../staging/lustre/include/linux/lnet/lib-types.h  |2 +-
>  drivers/staging/lustre/lnet/lnet/api-ni.c  |5 +-
>  drivers/staging/lustre/lnet/lnet/config.c  |   39 +-
>  drivers/staging/lustre/lnet/lnet/lib-move.c|6 +
>  drivers/staging/lustre/lnet/lnet/router.c  |   19 +-
>  drivers/staging/lustre/lnet/lnet/router_proc.c |   16 +-
>  drivers/staging/lustre/lnet/selftest/conctl.c  |   56 +-
>  drivers/staging/lustre/lnet/selftest/conrpc.c  |2 +-
>  drivers/staging/lustre/lnet/selftest/console.c |  105 ++--
>  drivers/staging/lustre/lnet/selftest/console.h |6 +-
>  drivers/staging/lustre/lnet/selftest/rpc.c |2 -
>  drivers/staging/lustre/lnet/selftest/selftest.h|3 -
>  drivers/staging/lustre/lnet/selftest/timer.c   |6 +-
>  drivers/staging/lustre/lustre/include/cl_object.h  |6 +-
>  drivers/staging/lustre/lustre/include/dt_object.h  |2 +-
>  .../lustre/lustre/include/linux/lustre_compat25.h  |4 +-
>  .../lustre/lustre/include/linux/lustre_intent.h|2 +-
>  .../lustre/lustre/include/linux/lustre_lite.h  |1 +
>  drivers/staging/lustre/lustre/include/lu_object.h  |   19 -
>  drivers/staging/lustre/lustre/include/lu_target.h  |   91 ---
>  .../lustre/lustre/include/lustre/lustre_idl.h  |  111 ++--
>  .../lustre/lustre/include/lustre/lustre_user.h |   63 +-
>  .../lustre/lustre/include/lustre/lustreapi.h   |  208 ---
>  drivers/staging/lustre/lustre/include/lustre_cfg.h |2 +
>

Re: [PATCH 04/40] staging/lustre/llite: Access to released file trigs a restore

2013-11-14 Thread Greg Kroah-Hartman

On Fri, Nov 15, 2013 at 12:13:06AM +0800, Peng Tao wrote:
> From: JC Lafoucriere 
> 
> When a client accesses data in a released file,
> or truncate it, client must trig a restore request.
> During this restore, the client must not glimpse and
> must use size from MDT. To bring the "restore is running"
> information on the client we add a new t_state bit field
> to mdt_info which will be used to carry transient file state.
> To memorise this information in the inode we add a new flag
> LLIF_FILE_RESTORING.

This patch also does other things not mentioned here (coding style
cleanups), which isn't allowed in a single patch (only do one thing per
patch, and never not document what you are doing...)

It also adds checkpatch warnings, which I will not accept in patches at
all here.  People are spending a lot of time cleaning up the coding
style issues, please NEVER add new ones, that just causes more work to
be needed to be done, and for people to have to go back and reclean
files they have already cleaned up.

So, sorry, I have to stop here at this series.  I've applied the first 3
to the opw-next branch of staging.git so they can live somewhere until
3.13-rc1 is out.

I know you spent a lot of time making these 120 patches to send me, but
that too is crazy.  You shouldn't wait that long to get feedback and
send patches to me at all.  Please send them in smaller series, with
less time between patch submissions.

So, care to just send me 10 patches or so now, which I can review and
accept if good, and we can sync up and continue from there?

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] perf top: Make -g refer to callchains

2013-11-14 Thread Ingo Molnar


btw., here's some 'perf top' call graph performance and profiling 
quality feedback, with the latest perf code:

'perf top --call-graph fp' now works very well, using just 0.2% 
of CPU time on a fast system:

 4676 mingo 20   0  612m  56m 9948 S 1  0.2   0:00.68 perf  

  

'perf top --call-graph dwarf' on the other hand is horrendously 
slow, using 20% of CPU time on a 4 GHz CPU:

  PID USER  PR  NI  VIRT  RES  SHR S  %CPU %MEMTIME+  COMMAND   

  
 4646 mingo 20   0  658m  81m  12m R19  0.3   0:18.17 perf

On another system with a 2.4GHz CPU it's taking up 100% of CPU 
time (!):

  PID USER  PR  NIVIRTRESSHR S  %CPU %MEM TIME+ COMMAND 

  
 8018 mingo 20   0  290320  45220   8520 R  99.5  0.3   0:58.81 perf  

Profiling 'perf top' shows all sorts of very high dwarf 
processing overhead:

#
# Overhead  Command  Shared Object              Symbol
# ........  .......  .........................  ...................................
#
 7.08% perf  perf   [.] access_mem
 7.03% perf  perf   [.] dso__data_read_offset
 5.83% perf  perf   [.] maps__find
 5.64% perf  libunwind-x86_64.so.8.0.1  [.] 0xba25
 4.75% perf  perf   [.] thread__find_addr_map
 3.81% perf  [kernel.kallsyms]  [k] unmap_single_vma
 2.57% perf  perf   [.] map__map_ip
 2.48% perf  libelf-0.156.so[.] 0x3a84
 2.12% perf  [kernel.kallsyms]  [k] memset
 2.12% perf  perf   [.] dso__data_read_addr
 2.10% perf  libc-2.17.so   [.] __memcpy_sse2
 1.72% perf  libc-2.17.so   [.] __memset_sse2
 1.58% perf  [kernel.kallsyms]  [k] page_fault
 1.56% perf  libc-2.17.so   [.] __memset_x86_64
 1.44% perf  perf   [.] find_proc_info
 1.25% perf  libelf-0.156.so[.] elf_end
 1.19% perf  [kernel.kallsyms]  [k] flush_tlb_mm_range
 1.06% perf  libc-2.17.so   [.] vfprintf
 1.04% perf  libunwind-x86_64.so.8.0.1  [.] _Ux86_64_dwarf_search_unwind_table
 1.00% perf  [kernel.kallsyms]  [k] __audit_syscall_exit
 0.94% perf  libc-2.17.so   [.] _int_free
 0.92% perf  libc-2.17.so   [.] _int_malloc
 0.84% perf  libc-2.17.so   [.] __memcmp_sse2
 0.81% perf  [kernel.kallsyms]  [k] unmapped_area_topdown
 0.71% perf  [kernel.kallsyms]  [k] system_call
 0.71% perf  [kernel.kallsyms]  [k] system_call_after_swapgs
 0.65% perf  [kernel.kallsyms]  [k] sysret_check
 0.63% perf  perf   [.] dso__find_symbol
 0.58% perf  [kernel.kallsyms]  [k] clear_page_c
 0.58% perf  [kernel.kallsyms]  [k] handle_mm_fault
 0.56% perf  libc-2.17.so   [.] __sigprocmask
 0.55% perf  [kernel.kallsyms]  [k] copy_user_generic_string
 0.51% perf  [kernel.kallsyms]  [k] __do_fault
 0.49% perf  [kernel.kallsyms]  [k] find_vma
 0.47% perf  libpthread-2.17.so [.] __libc_close
 0.44% perf  [kernel.kallsyms]  [k] __audit_syscall_entry
 0.44% perf  [kernel.kallsyms]  [k] mmap_region
 0.42% perf  [kernel.kallsyms]  [k] _raw_spin_lock
 0.41% perf  [kernel.kallsyms]  [k] kmem_cache_free
 0.40% perf  [kernel.kallsyms]  [k] kmem_cache_alloc
 0.40% perf  libpthread-2.17.so [.] pthread_mutex_unlock
 0.37% perf  [kernel.kallsyms]  [k] perf_event_aux_ctx
 0.37% perf  [kernel.kallsyms]  [k] do_munmap
 0.37% perf  libc-2.17.so   [.] free
 [...]

Thanks,

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 07/40] staging/lustre/hsm: Implementation of exclusive open

2013-11-14 Thread Greg Kroah-Hartman

On Fri, Nov 15, 2013 at 12:13:09AM +0800, Peng Tao wrote:
> From: Jinshan Xiong 
> 
> In this patch, a framework of lease is implemented. However,
> only exclusive lease is supported right now.

Which is a problem, why?

Why do you want to add this new functionality?  What is it good for?
Who will use it?

Oh, and the patch doesn't apply to my tree, AND it contains a bunch of
foolish coding style errors, so even if I did understand what this was
for, and why we wanted it, I couldn't.

Please be more careful, I'm really stopping here now on reviewing these
lustre patches.  Please clean them up, resync, and try again, I've now
purged them all from my mboxes, sorry.

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 01/39] staging/lustre/hsm: remove hsm_nl proc file

2013-11-14 Thread Greg Kroah-Hartman

On Fri, Nov 15, 2013 at 12:32:24AM +0800, Peng Tao wrote:
> From: "John L. Hammond" 
> 
> Remove the file /proc/fs/lustre/mdc/*/hsm_nl which was introduced "for
> testing purposes."

What does this mean?  Why is this allowed?  How do you know no one uses
it?  I need a much better changelog comment here, sorry.

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 12/40] staging/lustre/nfs: writing to new files will return ENOENT

2013-11-14 Thread Greg Kroah-Hartman

On Fri, Nov 15, 2013 at 12:28:20AM +0800, Peng Tao wrote:
> On Fri, Nov 15, 2013 at 12:22 AM, Cheng Shao  wrote:
> > Hi Peng,
> >
> > This patch was eventually reverted as it was causing interoperability 
> > issues with 2.1 Lustre server. We are working on the new patch, so please 
> > don't include this one.
> >
> Thanks for the notification. I didn't see the reverting patch in next
> 200 patches in Lustre tree so it must have happened recently.
> 
> Greg, please drop this one. I verified that the rest of the patchset
> can be applied cleanly.

Now dropped, thanks.

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

A problem about IO scheduler in kernel

2013-11-14 Thread 韩磊

These days I have been writing an I/O scheduler called "Simple_Deadline"
for the kernel.
"Simple_Deadline" is based on "deadline-iosched". The algorithm is very
simple. It has two lists, one for reads and the other for writes. A request
enters a list based on its weight, which is computed from the request's
size and whether it is a read or a write. When dispatching a request, it just
compares the weights of the read list and the write list, and the list with
the smaller weight dispatches.

When I modprobe this module and run it, it works well if only a little I/O
arrives. But when a large amount of I/O is running, the system crashes. Can
you help me find the problem? I am so sad and helpless about it.

When the system crashed, the screen displayed some information:

Message from syslogd@han at Nov 15 13:12:03 ...
 kernel:[ cut here ]

Message from syslogd@han at Nov 15 13:12:03 ...
 kernel:invalid opcode:  [#1] SMP

Message from syslogd@han at Nov 15 13:12:03 ...
 kernel:last sysfs file:
/sys/devices/pci:00/:00:04.0/:05:00.0/host0/port-0:0/end_device-0:0/target0:0:0/0:0:0:0/block/sda/queue/scheduler

Message from syslogd@han at Nov 15 13:12:03 ...
 kernel:Stack:

Message from syslogd@han at Nov 15 13:12:03 ...
 kernel:Call Trace:

Message from syslogd@han at Nov 15 13:12:03 ...
 kernel:Code: 2a 48 89 fe 4c 89 e7 e8 35 ff 01 00 48 8b 83 80 00 00 00
83 e0 03 4c 09 e0 4c 8b 64 24 08 48 89 83 80 00 00 00 48 8b 1c 24 c9
c3 <0f> 0b eb fe 66 90 55 48 89 e5 0f 1f 44 00 00 45 31 c0 48 89 f9


The code is attached.  Please help me find the bugs!  Thank you!
/*
 *  Deadline i/o scheduler.
 *
 *  Copyright (C) 2002 Jens Axboe 
 */
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 

/*
 * Scheduler tunables.
 * NOTE(review): none of these constants is referenced in the visible part of
 * this file; the notes below are assumptions to be confirmed against the
 * code that consumes them.
 */
static const int read_expire = HZ / 2; /* HZ / 2 jiffies, i.e. half a second */
static const int write_expire = 5 * HZ; /* 5 * HZ jiffies, i.e. five seconds */
static const int writes_starved = 2;
static const int fifo_batch = 16;  




/* NOTE(review): no user of DELTA is visible here -- purpose unconfirmed. */
#define DELTA 1

/* presumably per-byte cost terms for req_list_count_weight() -- TODO confirm units */
static const int read_time_per_byte = 30 ;  
static const int write_time_per_byte = 30  ; 


/* presumably fixed per-request cost terms for the same weight -- TODO confirm */
static const int read_time_once_transfer = 30 ; 
static const int write_time_once_transfer = 300 ; 

/*
 * One node of a weighted request list.  deadline_add_request() allocates one
 * node per queued request and links it into the list for the request's data
 * direction with req_list_add().
 */
struct req_list_head
{
	/* request this node stands for */
	struct request *current_req;
	/* value of jiffies sampled when the node was filled in */
	unsigned long last_jiffies;
	/* weight returned by req_list_count_weight(data_dir, request size) */
	long current_weight;
	/*
	 * List links.  NOTE(review): whether the list is circular or
	 * NULL-terminated is decided by req_list_add(), not visible here.
	 */
	struct req_list_head *next;
	struct req_list_head *prev;
};

/*
 * Per-queue scheduler state; deadline_add_request() reads it from
 * q->elevator->elevator_data.  req_weight_list, next_rq, sort_list and
 * fifo_expire are indexed by the request's data direction (rq_data_dir()).
 */
struct deadline_data {

	/* heads of the weighted request lists that deadline_add_request() fills */
	struct req_list_head req_weight_list[2];
	/* cached next request; advanced by deadline_del_rq_rb() on removal */
	struct request *next_rq[2];
	/* rbtree roots that elv_rb_add()/elv_rb_del() operate on */
	struct rb_root sort_list[2];	
	/* NOTE(review): presumably also one list per data direction -- confirm */
	struct list_head fifo_list[2];
	unsigned int batching;		/* number of sequential requests made */  
	sector_t last_sector;		/* head position */
	unsigned int starved;		/* times reads have starved writes */
	/* added to jiffies to form a request's fifo time in deadline_add_request() */
	int fifo_expire[2]; 
	int fifo_batch;
	int writes_starved;
	int front_merges;

};
/*
 * Forward declarations.
 * NOTE(review): the req_list_* helpers are declared with external linkage and
 * their definitions are not visible here; if they are only used by this
 * scheduler they should probably be static -- confirm before changing.
 */
void req_list_remove_request(struct req_list_head *list_head);
struct request *select_req_from_weight_list(struct req_list_head *req_write_list,struct req_list_head *req_read_list);
int req_list_empty(struct req_list_head *list_head);
void req_list_add(struct req_list_head *new_req,struct req_list_head *list_head);
void update_req_list_weight(struct req_list_head *list_head);
long req_list_count_weight(int data_dir,unsigned int bio_size);  
void Init_req_list_head(struct req_list_head *req_weight_list);
static void deadline_move_request(struct deadline_data *, struct request *);

static inline struct rb_root *
deadline_rb_root(struct deadline_data *dd, struct request *rq)
{
	return >sort_list[rq_data_dir(rq)];
}

static inline struct request *
deadline_latter_request(struct request *rq)
{
	struct rb_node *node = rb_next(>rb_node);

	if (node)
		return rb_entry_rq(node);

	return NULL;
}

/*
 * Insert @rq into the sort tree for its data direction.  Each time
 * elv_rb_add() hands back a request instead of NULL, that request is passed
 * to deadline_move_request() and the insertion is attempted again.
 */
static void
deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
{
	struct rb_root *tree = deadline_rb_root(dd, rq);

	for (;;) {
		struct request *clash = elv_rb_add(tree, rq);

		if (unlikely(clash)) {
			deadline_move_request(dd, clash);
			continue;
		}
		break;
	}
}

/*
 * Remove @rq from the sort tree for its data direction.  If @rq is the
 * cached next request for that direction, the cache is first moved on to
 * whatever deadline_latter_request() returns for it.
 */
static inline void
deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
{
	struct request **cached = &dd->next_rq[rq_data_dir(rq)];
	struct rb_root *tree;

	if (*cached == rq)
		*cached = deadline_latter_request(rq);

	tree = deadline_rb_root(dd, rq);
	elv_rb_del(tree, rq);
}

/*
 * add rq to rbtree and fifo
 */
static void   
deadline_add_request(struct request_queue *q, struct request *rq)
{
	printk("enter merge_add_request\n");
	struct deadline_data *dd = q->elevator->elevator_data;
	const int data_dir = rq_data_dir(rq);
	
	unsigned int bio_size=rq->__data_len;


	struct req_list_head *req_list=(struct req_list_head *)kmalloc(sizeof(*req_list),GFP_KERNEL);
	req_list->current_req=rq;
	req_list->current_weight=req_list_count_weight(data_dir,bio_size);
	req_list->last_jiffies=jiffies;
	req_list_add(req_list,>req_weight_list[data_dir]);

	deadline_add_rq_rb(dd, rq);
	rq_set_fifo_time(rq, jiffies + dd->fifo_expire[data_dir]);
	list_add_tail(>queuelist,

[PATCH] Revert "init/Kconfig: add option to disable kernel compression"

2013-11-14 Thread H. Peter Anvin

From: "H. Peter Anvin" 

This reverts commit 69f0554ec261fd686ac7fa1c598cc9eb27b83a80.

This patch breaks randconfig on at least the x86-64 architecture, and
most likely on others.  There is work underway to support uncompressed
kernels in a generic way, but it looks like it will amount to
rewriting the support from scratch; see the LKML thread in the Link:
for info.

Therefore, revert this change and wait for the fix.

Reported-by: Pavel Roskin 
Cc: Christian Ruppert 
Cc: Andrew Morton 
Link: http://lkml.kernel.org/r/20131113113418.167b8ffd@IRBT4585
Signed-off-by: H. Peter Anvin 
---
 init/Kconfig | 8 +---
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index 5496f307988e..bc8911fab28e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -118,6 +118,7 @@ config HAVE_KERNEL_LZ4
 choice
prompt "Kernel compression mode"
default KERNEL_GZIP
+   depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || 
HAVE_KERNEL_XZ || HAVE_KERNEL_LZO || HAVE_KERNEL_LZ4
help
  The linux kernel is a kind of self-extracting executable.
  Several compression algorithms are available, which differ
@@ -136,13 +137,6 @@ choice
 
  If in doubt, select 'gzip'
 
-config KERNEL_UNCOMPRESSED
-   bool "No compression"
-   help
- No compression at all. The kernel is huge but the compression and
- decompression times are zero.
- This is usually not what you want.
-
 config KERNEL_GZIP
bool "Gzip"
depends on HAVE_KERNEL_GZIP
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] perf top: Make -g refer to callchains

2013-11-14 Thread Ingo Molnar


* David Ahern  wrote:

> --- a/tools/perf/Documentation/perf-top.txt
> +++ b/tools/perf/Documentation/perf-top.txt
> @@ -143,12 +142,12 @@ Default is to monitor all CPUS.
>  --asm-raw::
>   Show raw instruction encoding of assembly instructions.
>  
> --G::
> +-g::
>   Enables call-graph (stack chain/backtrace) recording.
>  
>  --call-graph::
>   Setup and enable call-graph (stack chain/backtrace) recording,
> - implies -G.
> + implies -g.

Acked-by: Ingo Molnar 

Thanks,

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

linux-next: Tree for Nov 15

2013-11-14 Thread Stephen Rothwell

Hi all,

Please do *not* add any v3.14 material to linux-next until after
v3.13-rc1 is released.

Changes since 20131114:

The idle tree gained a conflict against the pm tree.

The dmaengine tree gained a conflict against the slave-dma tree.

The akpm tree gained a conflict against the sparc-next tree.



I have created today's linux-next tree at
git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
(patches at http://www.kernel.org/pub/linux/kernel/next/ ).  If you
are tracking the linux-next tree using git, you should not use "git pull"
to do so as that will try to merge the new linux-next release with the
old one.  You should use "git fetch" as mentioned in the FAQ on the wiki
(see below).

You can see which trees have been included by looking in the Next/Trees
file in the source.  There are also quilt-import.log and merge.log files
in the Next directory.  Between each merge, the tree was built with
a ppc64_defconfig for powerpc and an allmodconfig for x86_64 and a
multi_v7_defconfig for arm. After the final fixups (if any), it is also
built with powerpc allnoconfig (32 and 64 bit), ppc44x_defconfig and
allyesconfig (minus CONFIG_PROFILE_ALL_BRANCHES - this fails its final
link) and i386, sparc, sparc64 and arm defconfig. These builds also have
CONFIG_ENABLE_WARN_DEPRECATED, CONFIG_ENABLE_MUST_CHECK and
CONFIG_DEBUG_INFO disabled when necessary.

Below is a summary of the state of the merge.

I am currently merging 210 trees (counting Linus' and 29 trees of patches
pending for Linus' tree), more are welcome (even if they are currently
empty). Thanks to those who have contributed, and to those who haven't,
please do.

Status of my local build tests will be at
http://kisskb.ellerman.id.au/linux-next .  If maintainers want to give
advice about cross compilers/configs that work, we are always open to add
more builds.

Thanks to Randy Dunlap for doing many randconfig builds.  And to Paul
Gortmaker for triage and bug fixes.

There is a wiki covering stuff to do with linux-next at
http://linux.f-seidel.de/linux-next/pmwiki/ .  Thanks to Frank Seidel.

-- 
Cheers,
Stephen Rothwells...@canb.auug.org.au

$ git checkout master
$ git reset --hard stable
Merging origin/master (4fbf888accb3 Merge tag 'ext4_for_linus' of 
git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4)
Merging fixes/master (fa8218def1b1 Merge tag 'regmap-v3.11-rc7' of 
git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regmap)
Merging kbuild-current/rc-fixes (19514fc665ff arm, kbuild: make "make install" 
not depend on vmlinux)
Merging arc-current/for-curr (737d5b980be8 ARC: [plat-arcfpga] defconfig update)
Merging arm-current/fixes (6ecf830e5029 ARM: 7880/1: Clear the IT state 
independent of the Thumb-2 mode)
Merging m68k-current/for-linus (77a42796786c m68k: Remove deprecated 
IRQF_DISABLED)
Merging metag-fixes/fixes (3b2f64d00c46 Linux 3.11-rc2)
Merging powerpc-merge/merge (8b5ede69d24d powerpc/irq: Don't switch to irq 
stack from softirq stack)
Merging sparc/master (6d15ee492809 Merge 
git://git.kernel.org/pub/scm/virt/kvm/kvm)
Merging net/master (c9e9042994d3 ipv4: fix possible seqlock deadlock)
Merging ipsec/master (be408cd3e1fe Merge 
git://git.kernel.org/pub/scm/linux/kernel/git/davem/net)
Merging sound-current/for-linus (32b8544296b9 ALSA: jack: Unregister input 
device at disconnection)
Merging pci-current/for-linus (d43ed14b737c PCI: Fix whitespace, 
capitalization, and spelling errors)
Merging wireless/master (8e3ffa471091 prism54: set netdev type to "wlan")
Merging driver-core.current/driver-core-linus (31d141e3a666 Linux 3.12-rc6)
Merging tty.current/tty-linus (6e757ad2c92c tty/serial: at91: fix uart/usart 
selection for older products)
Merging usb.current/usb-linus (e1466ad5b1ae USB: serial: ftdi_sio: add id for 
Z3X Box device)
Merging staging.current/staging-linus (31d141e3a666 Linux 3.12-rc6)
Merging char-misc.current/char-misc-linus (31d141e3a666 Linux 3.12-rc6)
Merging input-current/for-linus (5beea882e641 Input: ALPS - add support for 
model found on Dell XT2)
Merging md-current/for-linus (d47648fcf061 raid5: avoid finding "discard" 
stripe)
Merging crypto-current/master (f262f0f5cad0 crypto: s390 - Fix aes-cbc IV 
corruption)
CONFLICT (content): Merge conflict in drivers/crypto/caam/jr.c
Merging ide/master (64110c16e012 ide: sgiioc4: Staticize ioc4_ide_attach_one())
Merging dwmw2/master (5950f0803ca9 pcmcia: remove RPX board stuff)
Merging sh-current/sh-fixes-for-linus (44033109e99c SH: Convert out[bwl] macros 
to inline functions)
Merging devicetree-current/devicetree/merge (1931ee143b0a Revert "drivers: of: 
add initialization code for dma reserved memory")
Merging rr-fixes/fixes (f6537f2f0eba scripts/kallsyms: filter symbols not in 
kernel address space)
Merging mfd-fixes/master (d0e639c9e06d Linux 3.12-rc4)
Merging vfio-fixes/for-linus (d

Re: linux-next: manual merge of the akpm tree with the sparc-next tree

2013-11-14 Thread David Miller

From: Stephen Rothwell 
Date: Fri, 15 Nov 2013 16:00:00 +1100

> [Forgot to cc Dave]

Yes, I told Linus about this conflict when I sent in my sparc pull
request this afternoon.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 4/5] f2fs: introduce f2fs_issue_discard() to clean up

2013-11-14 Thread Jaegeuk Kim

This patch adds f2fs_issue_discard() to clean up blkdev_issue_discard() flows.

Signed-off-by: Jaegeuk Kim 
---
 fs/f2fs/segment.c | 23 +--
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 505a889..d021cf3 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -266,6 +266,16 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, 
unsigned int segno)
mutex_unlock(_i->seglist_lock);
 }
 
+static void f2fs_issue_discard(struct f2fs_sb_info *sbi,
+   block_t blkstart, block_t blklen)
+{
+   sector_t sector_addr = blkstart << sbi->log_sectors_per_block;
+   sector_t sector_len = blklen << sbi->log_sectors_per_block;
+
+   blkdev_issue_discard(sbi->sb->s_bdev, sector_addr, sector_len,
+   GFP_NOFS, 0);
+}
+
 static void add_discard_addrs(struct f2fs_sb_info *sbi,
unsigned int segno, struct seg_entry *se)
 {
@@ -354,22 +364,15 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi)
if (!test_opt(sbi, DISCARD))
continue;
 
-   blkdev_issue_discard(sbi->sb->s_bdev,
-   START_BLOCK(sbi, start) <<
-   sbi->log_sectors_per_block,
-   (1 << (sbi->log_sectors_per_block +
-   sbi->log_blocks_per_seg)) * (end - start),
-   GFP_NOFS, 0);
+   f2fs_issue_discard(sbi, START_BLOCK(sbi, start),
+   (end - start) << sbi->log_blocks_per_seg);
}
mutex_unlock(_i->seglist_lock);
 
/* send small discards */
list_for_each_safe(this, next, head) {
entry = list_entry(this, struct discard_entry, list);
-   blkdev_issue_discard(sbi->sb->s_bdev,
-   entry->blkaddr << sbi->log_sectors_per_block,
-   (1 << sbi->log_sectors_per_block) * entry->len,
-   GFP_NOFS, 0);
+   f2fs_issue_discard(sbi, entry->blkaddr, entry->len);
list_del(>list);
SM_I(sbi)->nr_discards -= entry->len;
kmem_cache_free(discard_entry_slab, entry);
-- 
1.8.4.474.g128a96c

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 3/5] f2fs: add a sysfs entry to control max_discards

2013-11-14 Thread Jaegeuk Kim

If frequent small discards are issued to the device, the performance would
be degraded significantly.
So, this patch adds a sysfs entry to control the number of discards to be
issued during a checkpoint procedure.

By default, f2fs does not issue any small discards, which means max_discards
is zero.

Signed-off-by: Jaegeuk Kim 
---
 fs/f2fs/super.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index e9aa3f7..a022412 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -175,6 +175,7 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, 
max_sleep_time);
 F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, 
no_gc_sleep_time);
 F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle);
 F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards);
 
 #define ATTR_LIST(name) (_attr_##name.attr)
 static struct attribute *f2fs_attrs[] = {
@@ -183,6 +184,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(gc_no_gc_sleep_time),
ATTR_LIST(gc_idle),
ATTR_LIST(reclaim_segments),
+   ATTR_LIST(max_small_discards),
NULL,
 };
 
-- 
1.8.4.474.g128a96c

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 5/5] f2fs: add a tracepoint for f2fs_issue_discard

2013-11-14 Thread Jaegeuk Kim

This patch adds a tracepoint for f2fs_issue_discard.

Signed-off-by: Jaegeuk Kim 
---
 fs/f2fs/segment.c   |  1 +
 include/trace/events/f2fs.h | 23 +++
 2 files changed, 24 insertions(+)

diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index d021cf3..1f83999 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -274,6 +274,7 @@ static void f2fs_issue_discard(struct f2fs_sb_info *sbi,
 
blkdev_issue_discard(sbi->sb->s_bdev, sector_addr, sector_len,
GFP_NOFS, 0);
+   trace_f2fs_issue_discard(sbi->sb, blkstart, blklen);
 }
 
 static void add_discard_addrs(struct f2fs_sb_info *sbi,
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index e0dc355..47ee70d 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -727,6 +727,29 @@ TRACE_EVENT(f2fs_write_checkpoint,
__entry->msg)
 );
 
+TRACE_EVENT(f2fs_issue_discard,
+
+   TP_PROTO(struct super_block *sb, block_t blkstart, block_t blklen),
+
+   TP_ARGS(sb, blkstart, blklen),
+
+   TP_STRUCT__entry(
+   __field(dev_t,  dev)
+   __field(block_t, blkstart)
+   __field(block_t, blklen)
+   ),
+
+   TP_fast_assign(
+   __entry->dev= sb->s_dev;
+   __entry->blkstart = blkstart;
+   __entry->blklen = blklen;
+   ),
+
+   TP_printk("dev = (%d,%d), blkstart = 0x%llx, blklen = 0x%llx",
+   show_dev(__entry),
+   (unsigned long long)__entry->blkstart,
+   (unsigned long long)__entry->blklen)
+);
 #endif /* _TRACE_F2FS_H */
 
  /* This part must be outside protection */
-- 
1.8.4.474.g128a96c

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 2/5] f2fs: add key functions for small discards

2013-11-14 Thread Jaegeuk Kim

This patch adds key functions to activate the small discard feature.

Note that this procedure is conducted during the checkpoint only.

In flush_sit_entries(), when a new dirty sit entry is flushed, f2fs calls
add_discard_addrs() which searches candidates to be discarded.
A candidate is a block that is currently marked *invalidated* but that the
previous checkpoint still recognizes as *valid*.

At the end of a checkpoint procedure, f2fs throws discards based on the
discard entry list.

Signed-off-by: Jaegeuk Kim 
---
 fs/f2fs/segment.c | 60 +++
 1 file changed, 60 insertions(+)

diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 823526e..505a889 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -266,6 +266,47 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, 
unsigned int segno)
mutex_unlock(_i->seglist_lock);
 }
 
+static void add_discard_addrs(struct f2fs_sb_info *sbi,
+   unsigned int segno, struct seg_entry *se)
+{
+   struct list_head *head = _I(sbi)->discard_list;
+   struct discard_entry *new;
+   int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
+   int max_blocks = sbi->blocks_per_seg;
+   unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
+   unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
+   unsigned long dmap[entries];
+   unsigned int start = 0, end = -1;
+   int i;
+
+   if (!test_opt(sbi, DISCARD))
+   return;
+
+   /* zero block will be discarded through the prefree list */
+   if (!se->valid_blocks || se->valid_blocks == max_blocks)
+   return;
+
+   /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */
+   for (i = 0; i < entries; i++)
+   dmap[i] = (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i];
+
+   while (SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) {
+   start = __find_rev_next_bit(dmap, max_blocks, end + 1);
+   if (start >= max_blocks)
+   break;
+
+   end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1);
+
+   new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS);
+   INIT_LIST_HEAD(>list);
+   new->blkaddr = START_BLOCK(sbi, segno) + start;
+   new->len = end - start;
+
+   list_add_tail(>list, head);
+   SM_I(sbi)->nr_discards += end - start;
+   }
+}
+
 /*
  * Should call clear_prefree_segments after checkpoint is done.
  */
@@ -288,6 +329,9 @@ static void set_prefree_as_free_segments(struct 
f2fs_sb_info *sbi)
 
 void clear_prefree_segments(struct f2fs_sb_info *sbi)
 {
+   struct list_head *head = &(SM_I(sbi)->discard_list);
+   struct list_head *this, *next;
+   struct discard_entry *entry;
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
unsigned int total_segs = TOTAL_SEGS(sbi);
@@ -318,6 +362,18 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi)
GFP_NOFS, 0);
}
mutex_unlock(_i->seglist_lock);
+
+   /* send small discards */
+   list_for_each_safe(this, next, head) {
+   entry = list_entry(this, struct discard_entry, list);
+   blkdev_issue_discard(sbi->sb->s_bdev,
+   entry->blkaddr << sbi->log_sectors_per_block,
+   (1 << sbi->log_sectors_per_block) * entry->len,
+   GFP_NOFS, 0);
+   list_del(>list);
+   SM_I(sbi)->nr_discards -= entry->len;
+   kmem_cache_free(discard_entry_slab, entry);
+   }
 }
 
 static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int 
segno)
@@ -1469,6 +1525,10 @@ void flush_sit_entries(struct f2fs_sb_info *sbi)
 
sit_offset = SIT_ENTRY_OFFSET(sit_i, segno);
 
+   /* add discard candidates */
+   if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards)
+   add_discard_addrs(sbi, segno, se);
+
if (flushed)
goto to_sit_page;
 
-- 
1.8.4.474.g128a96c

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 1/5] f2fs: add a slab cache entry for small discards

2013-11-14 Thread Jaegeuk Kim

This patch adds a slab cache entry for small discards.

Each entry consists of:

struct discard_entry {
struct list_head list;  /* list head */
block_t blkaddr;/* block address to be discarded */
int len;/* # of consecutive blocks of the discard */
};

Signed-off-by: Jaegeuk Kim 
---
 fs/f2fs/f2fs.h| 14 ++
 fs/f2fs/segment.c | 20 
 fs/f2fs/super.c   |  7 ++-
 3 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 89dc750..c73e3df 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -97,6 +97,13 @@ struct dir_inode_entry {
struct inode *inode;/* vfs inode pointer */
 };
 
+/* for the list of blockaddresses to be discarded */
+struct discard_entry {
+   struct list_head list;  /* list head */
+   block_t blkaddr;/* block address to be discarded */
+   int len;/* # of consecutive blocks of the discard */
+};
+
 /* for the list of fsync inodes, used only during recovery */
 struct fsync_inode_entry {
struct list_head list;  /* list head */
@@ -308,6 +315,11 @@ struct f2fs_sm_info {
 
/* a threshold to reclaim prefree segments */
unsigned int rec_prefree_segments;
+
+   /* for small discard management */
+   struct list_head discard_list;  /* 4KB discard list */
+   int nr_discards;/* # of discards in the list */
+   int max_discards;   /* max. discards to be issued */
 };
 
 /*
@@ -1079,6 +1091,8 @@ int lookup_journal_in_cursum(struct f2fs_summary_block *,
 void flush_sit_entries(struct f2fs_sb_info *);
 int build_segment_manager(struct f2fs_sb_info *);
 void destroy_segment_manager(struct f2fs_sb_info *);
+int __init create_segment_manager_caches(void);
+void destroy_segment_manager_caches(void);
 
 /*
  * checkpoint.c
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 67f1e5b..823526e 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -22,6 +22,8 @@
 
 #define __reverse_ffz(x) __reverse_ffs(~(x))
 
+static struct kmem_cache *discard_entry_slab;
+
 /*
  * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since
  * MSB and LSB are reversed in a byte by f2fs_set_bit.
@@ -1798,6 +1800,10 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
sm_info->rec_prefree_segments = DEF_RECLAIM_PREFREE_SEGMENTS;
 
+   INIT_LIST_HEAD(_info->discard_list);
+   sm_info->nr_discards = 0;
+   sm_info->max_discards = 0;
+
err = build_sit_info(sbi);
if (err)
return err;
@@ -1913,3 +1919,17 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi)
sbi->sm_info = NULL;
kfree(sm_info);
 }
+
+int __init create_segment_manager_caches(void)
+{
+   discard_entry_slab = f2fs_kmem_cache_create("discard_entry",
+   sizeof(struct discard_entry), NULL);
+   if (!discard_entry_slab)
+   return -ENOMEM;
+   return 0;
+}
+
+void destroy_segment_manager_caches(void)
+{
+   kmem_cache_destroy(discard_entry_slab);
+}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index bafff72..e9aa3f7 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1078,9 +1078,12 @@ static int __init init_f2fs_fs(void)
err = create_node_manager_caches();
if (err)
goto free_inodecache;
-   err = create_gc_caches();
+   err = create_segment_manager_caches();
if (err)
goto free_node_manager_caches;
+   err = create_gc_caches();
+   if (err)
+   goto free_segment_manager_caches;
err = create_checkpoint_caches();
if (err)
goto free_gc_caches;
@@ -1102,6 +1105,8 @@ free_checkpoint_caches:
destroy_checkpoint_caches();
 free_gc_caches:
destroy_gc_caches();
+free_segment_manager_caches:
+   destroy_segment_manager_caches();
 free_node_manager_caches:
destroy_node_manager_caches();
 free_inodecache:
-- 
1.8.4.474.g128a96c

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: Updating 00-INDEX in Documentation/*

2013-11-14 Thread Rob Landley


On 10/28/2013 08:05:11 AM, Henrik Austad wrote:

Hi Rob, Jiri
Hacking away at python shows me that of the 254 subfolders 57 has  
outdated
00-INDEX, either with missing files, or files that has been removed.  
(see

list below)


I'm a bit behind on my email just now. (Started a new job last month,  
everything else got derailed for a bit.)


Now, I'm quite happy to start fixing this, but what I would like to  
know

before I start:

- is it considered a waste of time? i.e. will it be worth the time
- what is best, a single, big patch, or a series of one patch per
  directory?
- or should someone(tm) kick whoever caused 00-INDEX to be outdated,  
to fix

  it?


Go for it.

I've had a script that does this since 2007, but it's in the context of  
creating html indexes for the kernel.org/doc/Documentation directory,  
and after the kernel.org breakin they never gave me rsync access back  
(because it's not built into git and all any server ever does is run  
git, right?) so I couldn't update it anymore. Then they decided that  
the Documentation directory should just be a raw checkout from git with  
no html indexes (because git), and presumably they'll be doing the same  
for
http://kernel.org/doc/menuconfig any day now (how, I couldn't tell you,  
but it's apparently no longer my problem)...


So it's kinda hard to get enthused about it these days. I wanted to  
revive my old push to reorganize the documentation directory  
(https://lkml.org/lkml/2007/5/22/473 and such) but if the word from on  
high is that a giant raw pile of unsorted files is the optimal way to  
organize things, who am I to argue?


Same problem for kernel.org/doc/htmldocs: the ones I did way back when  
(https://web.archive.org/web/20090327025639/http://www.kernel.org/doc/htmldocs/)  
had both the "one big html file" version and the lots of little files  
version and I'd watch the build logs and send patches to get the  
warnings down, but they took that away and replaced it without even  
letting me know they were doing that, so I've fallen out of the habit.  
(My understanding was that the kernel.org guys officially don't care  
about any data that isn't accessed through git, and a web browser isn't  
git, so...)


*shrug* I suppose it's all still on the todo list somewhere. Maybe  
somewhere after reviving http://landley.net/qemu and catching that up  
to current, but it's in there...


Rob--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: call_usermodehelper in containers

2013-11-14 Thread Eric W. Biederman

Jeff Layton  writes:

> On Tue, 12 Nov 2013 17:02:36 +0400
> Stanislav Kinsbursky  wrote:
>
>> 12.11.2013 15:12, Jeff Layton пишет:
>> > On Mon, 11 Nov 2013 16:47:03 -0800
>> > Greg KH  wrote:
>> >
>> >> On Mon, Nov 11, 2013 at 07:18:25AM -0500, Jeff Layton wrote:
>> >>> We have a bit of a problem wrt to upcalls that use call_usermodehelper
>> >>> with containers and I'd like to bring this to some sort of resolution...
>> >>>
>> >>> A particularly problematic case (though there are others) is the
>> >>> nfsdcltrack upcall. It basically uses call_usermodehelper to run a
>> >>> program in userland to track some information on stable storage for
>> >>> nfsd.
>> >>
>> >> I thought the discussion at the kernel summit about this issue was:
>> >>   - don't do this.
>> >>   - don't do it.
>> >>   - if you really need to do this, fix nfsd
>> >>
>> >
>> > Sorry, I couldn't make the kernel summit so I missed that discussion. I
>> > guess LWN didn't cover it?
>> >
>> > In any case, I guess then that we'll either have to come up with some
>> > way to fix nfsd here, or simply ensure that nfsd can never be started
>> > unless root in the container has a full set of
>> > capabilities.
>> >
>> > One sort of Rube Goldberg possibility to fix nfsd is:
>> >
>> > - when we start nfsd in a container, fork off an extra kernel thread
>> >that just sits idle. That thread would need to be a descendant of the
>> >userland process that started nfsd, so we'd need to create it with
>> >kernel_thread().
>> >
>> > - Have the kernel just start up the UMH program in the init_ns mount
>> >namespace as it currently does, but also pass the pid of the idle
>> >kernel thread to the UMH upcall.
>> >
>> > - The program will then use /proc/<pid>/root and /proc/<pid>/ns/* to set
>> >itself up for doing things properly.
>> >
>> > Note that with this mechanism we can't actually run a different binary
>> > per container, but that's probably fine for most purposes.
>> >
>> 
>> Hmmm... Why we can't? We can go a bit further with userspace idea.
>> 
>> We use UMH some very limited number of user programs. For 2, actually:
>> 1) /sbin/nfs_cache_getent
>> 2) /sbin/nfsdcltrack
>> 
>
> No, the kernel uses them for a lot more than that. Pretty much all of
> the keys API upcalls use it. See all of the callers of
> call_usermodehelper. All of them are running user binaries out of the
> kernel, and almost all of them are certainly broken wrt containers.

Broken in the sense that we don't run them in the container yes.

I tried using the keys api for the uid mapping of containers and I wound
up being very disappointed because for testing/debugging I could never
flush any result I had ever returned a key for.  Which rather soured me
on the real-world usability of the key based user mode helpers.  Perhaps
I was doing it wrong but it seemed like a very brittle interface, that
was intolerant of human failures.

>> If we convert them into proxies, which use /proc/<pid>/root and 
>> /proc/<pid>/ns/*, this will allow us to lookup the right binary.
>> The only limitation here is presence of this "proxy" binaries on "host".
>> 
>
> Suppose I spawn my own container as a user, using all of this spiffy
> new user namespace stuff. Then I make the kernel use
> call_usermodehelper to call the upcall in the init_ns, and then trick
> it into running my new "escape_from_namespace" program with "real" root
> privileges.
>
> I don't think we can reasonably assume that having the kernel exec an
> arbitrary binary inside of a container is safe. Doing so inside of the
> init_ns is marginally more safe, but only marginally so...

One thing we have done with the core dump helper is this: because there is
enough information to know the namespaces of the program dumping core, we
have the root-owned and installed helper use setns to get inside those
namespaces, so we can have a per-namespace core dump policy.

If we can provide enough context to the other helpers that is probably
the easiest way to go.

The question is can we truly pass enough state.

>> And we don't need any significant changes in kernel.
>> 
>> BTW, Jeff, could you remind me, please, why exactly we need to use UMH to 
>> run the binary?
>> What are this capabilities, which force us to do so?
>> 
>
> Nothing _forces_ us to do so, but upcalls are very difficult to handle,
> and UMH has a lot of advantages over a long-running daemon launched by
> userland.
>
> Originally, I created the nfsdcltrack upcall as a running daemon called
> nfsdcld, and the kernel used rpc_pipefs to communicate with it.
>
> Everyone hated it because no one likes to have to run daemons for
> infrequently used upcalls. It's a pain for users to ensure that it's
> running and it's a pain to handle when it isn't. So, I was encouraged
> to turn that instead into a UMH upcall.
>
> But leaving that aside, this problem is a lot larger than just nfsd. We
> have a *lot* of UMH upcalls in the kernel, so this problem is more
> general than just "fixing"

Re: linux-next: manual merge of the akpm tree with the sparc-next tree

2013-11-14 Thread Stephen Rothwell

[Forgot to cc Dave]

On Fri, 15 Nov 2013 15:57:33 +1100 Stephen Rothwell  
wrote:
>
> Hi Andrew,
> 
> Today's linux-next merge of the akpm tree got a conflict in
> arch/sparc/mm/init_64.c between commit 37b3a8ff3e08 ("sparc64: Move from
> 4MB to 8MB huge pages") from the sparc-next tree and commit "sparc:
> handle pgtable_page_ctor() fail" from the akpm tree.
> 
> I fixed it up (see below) and can carry the fix as necessary (no action
> is required).
> 
> -- 
> Cheers,
> Stephen Rothwells...@canb.auug.org.au
> 
> diff --cc arch/sparc/mm/init_64.c
> index bd6430ded69f,d6de9353ee11..
> --- a/arch/sparc/mm/init_64.c
> +++ b/arch/sparc/mm/init_64.c
> @@@ -2563,16 -2511,21 +2563,16 @@@ pte_t *pte_alloc_one_kernel(struct mm_s
>   pgtable_t pte_alloc_one(struct mm_struct *mm,
>   unsigned long address)
>   {
>  -struct page *page;
>  -pte_t *pte;
>  -
>  -pte = get_from_cache(mm);
>  -if (pte)
>  -return pte;
>  +struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
>  +   __GFP_REPEAT | __GFP_ZERO);
> - pte_t *pte = NULL;
>   
> - if (page) {
> - pgtable_page_ctor(page);
> - pte = (pte_t *) page_address(page);
>  -page = __alloc_for_cache(mm);
> + if (!page)
> + return NULL;
> + if (!pgtable_page_ctor(page)) {
> + free_hot_cold_page(page, 0);
> + return NULL;
>   }
> - 
> - return pte;
> + return (pte_t *) page_address(page);
>   }
>   
>   void pte_free_kernel(struct mm_struct *mm, pte_t *pte)


pgpDUd_ciYf9H.pgp
Description: PGP signature

Re: [REVIEW][PATCH 1/2] userns: Better restrictions on when proc and sysfs can be mounted

2013-11-14 Thread Eric W. Biederman

Gao feng  writes:

> On 11/15/2013 12:54 AM, Andy Lutomirski wrote:
>> On Thu, Nov 14, 2013 at 3:10 AM, Gao feng  wrote:
>>> On 11/13/2013 03:26 PM, Gao feng wrote:
 On 11/09/2013 01:42 PM, Eric W. Biederman wrote:
> Right now I would rather not have the empty directory exception than
> remove this code.
>
> The test is a little trickier to write than it might otherwise be
> because /proc and /sys tend to be slightly imperfect filesystems.
>
> I think the only way to really test that is to call readdir on the
> directory itself :(  I don't like that thought.
>
> I don't know what I was thinking when I wrote that test but I definitely
> goofed up.  Grr!
>
> I can certainly filter out any directory with nlink > 2.  That would be
> an easy partial step forward.
>
> The real question though is how do I detect directories it is safe to
> mount on where there will not be files in them.  I can't call iterate
> with the namespace_lock held so things are a bit tricky.
>

 I know this problem is not easy to be resolved. why not let the user
 make the decision?  maybe we can introduce a new mount option MS_LOCK,
 if user wants to use mount to hide something, he should use mount with
 option MS_LOCK. so the unprivileged user can't umount this filesystem and
 fail to mount the filesystem if one of its child mount is mounted with
 MS_LOCK option otherwise he use MS_REC too.

>>>
>>> Something like this.
>>>
>>> From 437f33ea366623c7a9d557b2e84cae424876a44f Mon Sep 17 00:00:00 2001
>>> From: Gao feng 
>>> Date: Wed, 13 Nov 2013 16:06:46 +0800
>>> Subject: [PATCH] userns: introduce new mount option MS_LOCK
>>>
>>> After commit 5ff9d8a65ce80efb509ce4e8051394e9ed2cd942
>>> vfs: Lock in place mounts from more privileged users,
>>> in userns, the mounts of child mntns which copied from
>>> parent mntns is locked and user has no rights to umount/move
>>> them, it's too strict.
>>>
>>> The core purpose of above commit is trying to prevent
>>> unprivileged user from accessing files hidden by mount.
>>> This patch introduces a new mount option MS_LOCK, this
>>> gives user the capable to mount filesystem as the type
>>> of lock if he wants to use mount to hide something.
>>>
>> 
>> This is bad -- if something was secure in old kernels, it needs to
>> stay secure.  If you had MS_NOT_A_LOCK, that would be okay, but it
>> might not solve your problem.
>> 
>
> what you mean old kernels here? I saw patch "vfs: Lock in place mounts from 
> more privileged users"
> is merged into upstream in linux 3.12-rc1, this is not very old. I think there
> are not many userspace processes rely on this feature.

Sort of true.  Most people aren't that silly.  This feature was added to
defend against a theoretical attack that you can use with mount
namespaces.

In particular the scenario we are concerned with is:

Suppose there are two filesystems a and b that look like:

a:/usr/
a:/usr/my_very_secret_file
a:/dev/
a:/etc/
a:/lib/

b:/bin/
b:/etc/
b:/games/
b:/include/
b:/lib/
b:/lib32/
b:/local/
b:/sbin/
b:/share/
b:/src/

And filesystem b is mounted on a:/usr hiding a:/usr/my_very_secret_file

So the filesystem looks like:

/usr/
/usr/bin/
/usr/etc/
/usr/games/
/usr/include/
/usr/lib/
/usr/lib32/
/usr/local/
/usr/sbin/
/usr/share/
/usr/src/
/dev/
/etc/
/lib/

Without locking mounts into place an unprivileged user can clone the
mount namespace and do "umount /usr" and read /usr/my_very_secret_file.

Most systems don't hide sensitive things with mounts but it is very
possible and guarding against it is fairly cheap and easy.  And while a
little annoying it should not be a large impediment to unprivileged user
of the user namespace because pivot root still works.

This thread started talking about bugs in fs_fully_visible.  And those
bugs are fixable and I aim to get to them shortly.  At the very least
I can lie and test for nlink <= 2 which fixes the regression in mounting
proc.

Then I can write the fun version that takes references and drops locks
so it can call the internal version of readdir to see if a directory is
actually empty.

But the principle remains the same: we really don't want to reveal
anything that is hidden under a mount on purpose or by mistake.  Just
because then we don't have to think about those things from a security
point of view making everyone's life easier.

Eric
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH -tip RFC v2 05/22] kprobes: Use NOKPROBE_SYMBOL() in sample modules

2013-11-14 Thread Masami Hiramatsu

Use NOKPROBE_SYMBOL() to protect handlers from kprobes
in sample modules.

Signed-off-by: Masami Hiramatsu 
Cc: Ananth N Mavinakayanahalli 
---
 samples/kprobes/jprobe_example.c|1 +
 samples/kprobes/kprobe_example.c|3 +++
 samples/kprobes/kretprobe_example.c |2 ++
 3 files changed, 6 insertions(+)

diff --git a/samples/kprobes/jprobe_example.c b/samples/kprobes/jprobe_example.c
index b754135..40114ac 100644
--- a/samples/kprobes/jprobe_example.c
+++ b/samples/kprobes/jprobe_example.c
@@ -35,6 +35,7 @@ static long jdo_fork(unsigned long clone_flags, unsigned long 
stack_start,
jprobe_return();
return 0;
 }
+NOKPROBE_SYMBOL(jdo_fork);
 
 static struct jprobe my_jprobe = {
.entry  = jdo_fork,
diff --git a/samples/kprobes/kprobe_example.c b/samples/kprobes/kprobe_example.c
index 366db1a..462d90f 100644
--- a/samples/kprobes/kprobe_example.c
+++ b/samples/kprobes/kprobe_example.c
@@ -46,6 +46,7 @@ static int handler_pre(struct kprobe *p, struct pt_regs *regs)
/* A dump_stack() here will give a stack backtrace */
return 0;
 }
+NOKPROBE_SYMBOL(handler_pre);
 
 /* kprobe post_handler: called after the probed instruction is executed */
 static void handler_post(struct kprobe *p, struct pt_regs *regs,
@@ -68,6 +69,7 @@ static void handler_post(struct kprobe *p, struct pt_regs 
*regs,
p->addr, regs->ex1);
 #endif
 }
+NOKPROBE_SYMBOL(handler_post);
 
 /*
  * fault_handler: this is called if an exception is generated for any
@@ -81,6 +83,7 @@ static int handler_fault(struct kprobe *p, struct pt_regs 
*regs, int trapnr)
/* Return 0 because we don't handle the fault. */
return 0;
 }
+NOKPROBE_SYMBOL(handler_fault);
 
 static int __init kprobe_init(void)
 {
diff --git a/samples/kprobes/kretprobe_example.c 
b/samples/kprobes/kretprobe_example.c
index 1041b67..d932c52 100644
--- a/samples/kprobes/kretprobe_example.c
+++ b/samples/kprobes/kretprobe_example.c
@@ -47,6 +47,7 @@ static int entry_handler(struct kretprobe_instance *ri, 
struct pt_regs *regs)
data->entry_stamp = ktime_get();
return 0;
 }
+NOKPROBE_SYMBOL(entry_handler);
 
 /*
  * Return-probe handler: Log the return value and duration. Duration may turn
@@ -66,6 +67,7 @@ static int ret_handler(struct kretprobe_instance *ri, struct 
pt_regs *regs)
func_name, retval, (long long)delta);
return 0;
 }
+NOKPROBE_SYMBOL(ret_handler);
 
 static struct kretprobe my_kretprobe = {
.handler= ret_handler,

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH -tip RFC v2 04/22] kprobes: Support blacklist functions in module

2013-11-14 Thread Masami Hiramatsu

To blacklist the functions in a module (e.g. user-defined
kprobe handler and the functions invoked from it), expand
blacklist support for modules.
With this change, users can use NOKPROBE_SYMBOL() macro in
their own modules.

Signed-off-by: Masami Hiramatsu 
Cc: Ananth N Mavinakayanahalli 
Cc: "David S. Miller" 
Cc: Rob Landley 
Cc: Rusty Russell 
---
 Documentation/kprobes.txt |8 
 include/linux/module.h|5 +
 kernel/kprobes.c  |   44 +---
 kernel/module.c   |6 ++
 4 files changed, 60 insertions(+), 3 deletions(-)

diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt
index 7062631..c6634b3 100644
--- a/Documentation/kprobes.txt
+++ b/Documentation/kprobes.txt
@@ -512,6 +512,14 @@ int enable_jprobe(struct jprobe *jp);
 Enables *probe which has been disabled by disable_*probe(). You must specify
 the probe which has been registered.
 
+4.9 NOKPROBE_SYMBOL()
+
+#include <linux/kprobes.h>
+NOKPROBE_SYMBOL(FUNCTION);
+
+Protects given FUNCTION from other kprobes. This is useful for handler
+functions and functions called from the handlers.
+
 5. Kprobes Features and Limitations
 
 Kprobes allows multiple probes at the same address.  Currently,
diff --git a/include/linux/module.h b/include/linux/module.h
index 05f2447..acb682b 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include 
@@ -360,6 +361,10 @@ struct module
unsigned int num_ftrace_callsites;
unsigned long *ftrace_callsites;
 #endif
+#ifdef CONFIG_KPROBES
+   struct kprobe_blackpoint **kprobe_blacklist;
+   unsigned int num_kprobe_blacklist;
+#endif
 
 #ifdef CONFIG_MODULE_UNLOAD
/* What modules depend on me? */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 007235e..4e8ce87 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -88,6 +88,7 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long 
hash)
 
 /* Blacklist -- list of struct kprobe_blackpoint */
 static LIST_HEAD(kprobe_blacklist);
+static DEFINE_MUTEX(kprobe_blacklist_mutex);
 
 #ifdef __ARCH_WANT_KPROBES_INSN_SLOT
 /*
@@ -1416,6 +1417,7 @@ static __kprobes int check_kprobe_address_safe(struct 
kprobe *p,
 #endif
}
 
+   mutex_lock(&kprobe_blacklist_mutex);
jump_label_lock();
preempt_disable();
 
@@ -1453,6 +1455,7 @@ static __kprobes int check_kprobe_address_safe(struct 
kprobe *p,
 out:
preempt_enable();
jump_label_unlock();
+   mutex_unlock(&kprobe_blacklist_mutex);
 
return ret;
 }
@@ -2007,6 +2010,11 @@ void __kprobes dump_kprobe(struct kprobe *kp)
   kp->symbol_name, kp->addr, kp->offset);
 }
 
+static void populate_kprobe_blacklist(struct kprobe_blackpoint **start,
+ struct kprobe_blackpoint **end);
+static void shrink_kprobe_blacklist(struct kprobe_blackpoint **start,
+   struct kprobe_blackpoint **end);
+
 /* Module notifier call back, checking kprobes on the module */
 static int __kprobes kprobes_module_callback(struct notifier_block *nb,
 unsigned long val, void *data)
@@ -2017,6 +2025,16 @@ static int __kprobes kprobes_module_callback(struct 
notifier_block *nb,
unsigned int i;
int checkcore = (val == MODULE_STATE_GOING);
 
+   /* Add/remove module blacklist */
+   if (val == MODULE_STATE_COMING)
+   populate_kprobe_blacklist(mod->kprobe_blacklist,
+ mod->kprobe_blacklist +
+ mod->num_kprobe_blacklist);
+   else if (val == MODULE_STATE_GOING)
+   shrink_kprobe_blacklist(mod->kprobe_blacklist,
+   mod->kprobe_blacklist +
+   mod->num_kprobe_blacklist);
+
if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE)
return NOTIFY_DONE;
 
@@ -2050,6 +2068,18 @@ static struct notifier_block kprobe_module_nb = {
.priority = 0
 };
 
+/* Shrink the blacklist */
+static void shrink_kprobe_blacklist(struct kprobe_blackpoint **start,
+   struct kprobe_blackpoint **end)
+{
+   struct kprobe_blackpoint **iter;
+
+   mutex_lock(&kprobe_blacklist_mutex);
+   for (iter = start; (unsigned long)iter < (unsigned long)end; iter++)
+   list_del(&(*iter)->list);
+   mutex_unlock(&kprobe_blacklist_mutex);
+}
+
 /*
  * Lookup and populate the kprobe_blacklist.
  *
@@ -2058,14 +2088,15 @@ static struct notifier_block kprobe_module_nb = {
  * since a kprobe need not necessarily be at the beginning
  * of a function.
  */
-static void __init populate_kprobe_blacklist(struct kprobe_blackpoint **start,
-struct kprobe_blackpoint **end)
+static void populate_kprobe_blacklist(struct kprobe_blackpoint

[PATCH -tip RFC v2 06/22] kprobes/x86: Allow probe on some kprobe preparation functions

2013-11-14 Thread Masami Hiramatsu

There is no need to prohibit probing on the functions
used in the preparation phase. They can be probed safely because
they are not invoked from breakpoint/fault/debug handlers,
so there is no chance of causing recursive exceptions.

Following functions are now removed from the kprobes blacklist.
 can_boost
 can_probe
 can_optimize
 is_IF_modifier
 __copy_instruction
 copy_optimized_instructions
 arch_copy_kprobe
 arch_prepare_kprobe
 arch_arm_kprobe
 arch_disarm_kprobe
 arch_remove_kprobe
 arch_trampoline_kprobe
 arch_prepare_kprobe_ftrace
 arch_prepare_optimized_kprobe
 arch_check_optimized_kprobe
 arch_within_optimized_kprobe
 __arch_remove_optimized_kprobe
 arch_remove_optimized_kprobe
 arch_optimize_kprobes
 arch_unoptimize_kprobe

I tested the safety via kprobe-tracer as below;

 # cd /sys/kernel/debug/tracing
 # cat above-converted-symbols-list | while read s; do
   echo "p $s"; done > kprobe_events
 (Note: some symbols are not found, those are inlined)
 # echo 1 > events/kprobes/enable
 # echo p:foo vfs_symlink >> kprobe_events
 # echo p:bar vfs_symlink+5 >> kprobe_events
 # echo p vfs_symlink+5 >> kprobe_events
 # echo 1 > events/kprobes/foo/enable
 # ln -sf /tmp/foo /tmp/bar
 # echo 0 > events/kprobes/foo/enable
 # echo -:foo >> kprobe_events
 # head -n 20 trace
 # echo 0 > events/kprobes/enable
 # echo > kprobe_events
 # echo > trace

Signed-off-by: Masami Hiramatsu 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
Cc: Steven Rostedt 
Cc: Andrew Morton 
---
 arch/x86/kernel/kprobes/core.c   |   20 ++--
 arch/x86/kernel/kprobes/ftrace.c |2 +-
 arch/x86/kernel/kprobes/opt.c|   24 
 3 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 79a3f96..7ccb3d3 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -159,7 +159,7 @@ static kprobe_opcode_t *__kprobes 
skip_prefixes(kprobe_opcode_t *insn)
  * Returns non-zero if opcode is boostable.
  * RIP relative instructions are adjusted at copying time in 64 bits mode
  */
-int __kprobes can_boost(kprobe_opcode_t *opcodes)
+int can_boost(kprobe_opcode_t *opcodes)
 {
kprobe_opcode_t opcode;
kprobe_opcode_t *orig_opcodes = opcodes;
@@ -260,7 +260,7 @@ unsigned long recover_probed_instruction(kprobe_opcode_t 
*buf, unsigned long add
 }
 
 /* Check if paddr is at an instruction boundary */
-static int __kprobes can_probe(unsigned long paddr)
+static int can_probe(unsigned long paddr)
 {
unsigned long addr, __addr, offset = 0;
struct insn insn;
@@ -299,7 +299,7 @@ static int __kprobes can_probe(unsigned long paddr)
 /*
  * Returns non-zero if opcode modifies the interrupt flag.
  */
-static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
+static int is_IF_modifier(kprobe_opcode_t *insn)
 {
/* Skip prefixes */
insn = skip_prefixes(insn);
@@ -322,7 +322,7 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
  * If not, return null.
  * Only applicable to 64-bit x86.
  */
-int __kprobes __copy_instruction(u8 *dest, u8 *src)
+int __copy_instruction(u8 *dest, u8 *src)
 {
struct insn insn;
kprobe_opcode_t buf[MAX_INSN_SIZE];
@@ -365,7 +365,7 @@ int __kprobes __copy_instruction(u8 *dest, u8 *src)
return insn.length;
 }
 
-static int __kprobes arch_copy_kprobe(struct kprobe *p)
+static int arch_copy_kprobe(struct kprobe *p)
 {
int ret;
 
@@ -392,7 +392,7 @@ static int __kprobes arch_copy_kprobe(struct kprobe *p)
return 0;
 }
 
-int __kprobes arch_prepare_kprobe(struct kprobe *p)
+int arch_prepare_kprobe(struct kprobe *p)
 {
if (alternatives_text_reserved(p->addr, p->addr))
return -EINVAL;
@@ -407,17 +407,17 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
return arch_copy_kprobe(p);
 }
 
-void __kprobes arch_arm_kprobe(struct kprobe *p)
+void arch_arm_kprobe(struct kprobe *p)
 {
text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
 }
 
-void __kprobes arch_disarm_kprobe(struct kprobe *p)
+void arch_disarm_kprobe(struct kprobe *p)
 {
text_poke(p->addr, >opcode, 1);
 }
 
-void __kprobes arch_remove_kprobe(struct kprobe *p)
+void arch_remove_kprobe(struct kprobe *p)
 {
if (p->ainsn.insn) {
free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
@@ -1071,7 +1071,7 @@ int __init arch_init_kprobes(void)
return 0;
 }
 
-int __kprobes arch_trampoline_kprobe(struct kprobe *p)
+int arch_trampoline_kprobe(struct kprobe *p)
 {
return 0;
 }
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 23ef5c5..dcaa131 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -85,7 +85,7 @@ end:
local_irq_restore(flags);
 }
 
-int __kprobes arch_prepare_kprobe_ftrace(struct kprobe *p)
+int arch_prepare_kprobe_ftrace(struct kprobe *p)
 {

[PATCH -tip RFC v2 18/22] x86/dumpstack: Use NOKPROBE_SYMBOL macro in dumpstack.c

2013-11-14 Thread Masami Hiramatsu

Use NOKPROBE_SYMBOL macro for protecting functions
from kprobes instead of __kprobes annotation in
dumpstack.c.

Signed-off-by: Masami Hiramatsu 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
Cc: Andrew Morton 
Cc: Jiri Slaby 
Cc: Tejun Heo 
Cc: Vineet Gupta 
---
 arch/x86/kernel/dumpstack.c |9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index d9c12d3..b74ebc7 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -200,7 +200,7 @@ static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
 static int die_owner = -1;
 static unsigned int die_nest_count;
 
-unsigned __kprobes long oops_begin(void)
+unsigned long oops_begin(void)
 {
int cpu;
unsigned long flags;
@@ -223,8 +223,9 @@ unsigned __kprobes long oops_begin(void)
return flags;
 }
 EXPORT_SYMBOL_GPL(oops_begin);
+NOKPROBE_SYMBOL(oops_begin);
 
-void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
+void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
 {
if (regs && kexec_should_crash(current))
crash_kexec(regs);
@@ -247,8 +248,9 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs 
*regs, int signr)
panic("Fatal exception");
do_exit(signr);
 }
+NOKPROBE_SYMBOL(oops_end);
 
-int __kprobes __die(const char *str, struct pt_regs *regs, long err)
+int __die(const char *str, struct pt_regs *regs, long err)
 {
 #ifdef CONFIG_X86_32
unsigned short ss;
@@ -291,6 +293,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, 
long err)
 #endif
return 0;
 }
+NOKPROBE_SYMBOL(__die);
 
 /*
  * This is gone through when something in the kernel has done something bad

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH -tip RFC v2 22/22] sched: Use NOKPROBE_SYMBOL macro in sched

2013-11-14 Thread Masami Hiramatsu

Use NOKPROBE_SYMBOL macro to protect functions from
kprobes instead of __kprobes annotation in sched/core.c.

Signed-off-by: Masami Hiramatsu 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
---
 kernel/sched/core.c |6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 504fdbd..fece2e3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2342,7 +2342,7 @@ notrace unsigned long get_parent_ip(unsigned long addr)
 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_PREEMPT_TRACER))
 
-void __kprobes preempt_count_add(int val)
+void preempt_count_add(int val)
 {
 #ifdef CONFIG_DEBUG_PREEMPT
/*
@@ -2363,8 +2363,9 @@ void __kprobes preempt_count_add(int val)
trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 }
 EXPORT_SYMBOL(preempt_count_add);
+NOKPROBE_SYMBOL(preempt_count_add);
 
-void __kprobes preempt_count_sub(int val)
+void preempt_count_sub(int val)
 {
 #ifdef CONFIG_DEBUG_PREEMPT
/*
@@ -2385,6 +2386,7 @@ void __kprobes preempt_count_sub(int val)
__preempt_count_sub(val);
 }
 EXPORT_SYMBOL(preempt_count_sub);
+NOKPROBE_SYMBOL(preempt_count_sub);
 
 #endif
 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH -tip RFC v2 16/22] x86/nmi: Use NOKPROBE_SYMBOL macro for nmi handlers

2013-11-14 Thread Masami Hiramatsu

Use NOKPROBE_SYMBOL macro to protect functions from
kprobes instead of __kprobes annotation for nmi handlers.

Signed-off-by: Masami Hiramatsu 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
Cc: Peter Zijlstra 
Cc: Paul Mackerras 
Cc: Arnaldo Carvalho de Melo 
Cc: Michel Lespinasse 
Cc: Dave Hansen 
Cc: Zhang Rui 
---
 arch/x86/kernel/apic/hw_nmi.c|3 ++-
 arch/x86/kernel/cpu/perf_event.c |3 ++-
 arch/x86/kernel/cpu/perf_event_amd_ibs.c |3 ++-
 arch/x86/kernel/nmi.c|   18 --
 4 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index a698d71..73eb5b3 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -60,7 +60,7 @@ void arch_trigger_all_cpu_backtrace(void)
smp_mb__after_clear_bit();
 }
 
-static int __kprobes
+static int
 arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
 {
int cpu;
@@ -80,6 +80,7 @@ arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, 
struct pt_regs *regs)
 
return NMI_DONE;
 }
+NOKPROBE_SYMBOL(arch_trigger_all_cpu_backtrace_handler);
 
 static int __init register_trigger_all_cpu_backtrace(void)
 {
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 98f845b..396c1a2 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1273,7 +1273,7 @@ void perf_events_lapic_init(void)
apic_write(APIC_LVTPC, APIC_DM_NMI);
 }
 
-static int __kprobes
+static int
 perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
 {
u64 start_clock;
@@ -1291,6 +1291,7 @@ perf_event_nmi_handler(unsigned int cmd, struct pt_regs 
*regs)
 
return ret;
 }
+NOKPROBE_SYMBOL(perf_event_nmi_handler);
 
 struct event_constraint emptyconstraint;
 struct event_constraint unconstrained;
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c 
b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index e09f0bf..c668309 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -592,7 +592,7 @@ out:
return 1;
 }
 
-static int __kprobes
+static int
 perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
 {
int handled = 0;
@@ -605,6 +605,7 @@ perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
 
return handled;
 }
+NOKPROBE_SYMBOL(perf_ibs_nmi_handler);
 
 static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
 {
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 6fcb49c..38ce829 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -95,7 +95,7 @@ static int __init nmi_warning_debugfs(void)
 }
 fs_initcall(nmi_warning_debugfs);
 
-static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool 
b2b)
+static int nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
 {
struct nmi_desc *desc = nmi_to_desc(type);
struct nmiaction *a;
@@ -137,6 +137,7 @@ static int __kprobes nmi_handle(unsigned int type, struct 
pt_regs *regs, bool b2
/* return total number of NMI events handled */
return handled;
 }
+NOKPROBE_SYMBOL(nmi_handle);
 
 int __register_nmi_handler(unsigned int type, struct nmiaction *action)
 {
@@ -197,7 +198,7 @@ void unregister_nmi_handler(unsigned int type, const char 
*name)
 }
 EXPORT_SYMBOL_GPL(unregister_nmi_handler);
 
-static __kprobes void
+static void
 pci_serr_error(unsigned char reason, struct pt_regs *regs)
 {
/* check to see if anyone registered against these types of errors */
@@ -227,8 +228,9 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs)
reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
outb(reason, NMI_REASON_PORT);
 }
+NOKPROBE_SYMBOL(pci_serr_error);
 
-static __kprobes void
+static void
 io_check_error(unsigned char reason, struct pt_regs *regs)
 {
unsigned long i;
@@ -258,8 +260,9 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
reason &= ~NMI_REASON_CLEAR_IOCHK;
outb(reason, NMI_REASON_PORT);
 }
+NOKPROBE_SYMBOL(io_check_error);
 
-static __kprobes void
+static void
 unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
 {
int handled;
@@ -287,11 +290,12 @@ unknown_nmi_error(unsigned char reason, struct pt_regs 
*regs)
 
pr_emerg("Dazed and confused, but trying to continue\n");
 }
+NOKPROBE_SYMBOL(unknown_nmi_error);
 
 static DEFINE_PER_CPU(bool, swallow_nmi);
 static DEFINE_PER_CPU(unsigned long, last_nmi_rip);
 
-static __kprobes void default_do_nmi(struct pt_regs *regs)
+static void default_do_nmi(struct pt_regs *regs)
 {
unsigned char reason = 0;
int handled;
@@ -390,6 +394,7 @@ static __kprobes void default_do_nmi(struct pt_regs *regs)
else
unknown_nmi_error(reason, regs);
 }
+NOKPROBE_SYMBOL(default_do_nmi);
 
 /*
  * NMIs can hit

linux-next: manual merge of the akpm tree with the sparc-next tree

2013-11-14 Thread Stephen Rothwell

Hi Andrew,

Today's linux-next merge of the akpm tree got a conflict in
arch/sparc/mm/init_64.c between commit 37b3a8ff3e08 ("sparc64: Move from
4MB to 8MB huge pages") from the sparc-next tree and commit "sparc:
handle pgtable_page_ctor() fail" from the akpm tree.

I fixed it up (see below) and can carry the fix as necessary (no action
is required).

-- 
Cheers,
Stephen Rothwell                    s...@canb.auug.org.au

diff --cc arch/sparc/mm/init_64.c
index bd6430ded69f,d6de9353ee11..
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@@ -2563,16 -2511,21 +2563,16 @@@ pte_t *pte_alloc_one_kernel(struct mm_s
  pgtable_t pte_alloc_one(struct mm_struct *mm,
unsigned long address)
  {
 -  struct page *page;
 -  pte_t *pte;
 -
 -  pte = get_from_cache(mm);
 -  if (pte)
 -  return pte;
 +  struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
 + __GFP_REPEAT | __GFP_ZERO);
-   pte_t *pte = NULL;
  
-   if (page) {
-   pgtable_page_ctor(page);
-   pte = (pte_t *) page_address(page);
 -  page = __alloc_for_cache(mm);
+   if (!page)
+   return NULL;
+   if (!pgtable_page_ctor(page)) {
+   free_hot_cold_page(page, 0);
+   return NULL;
}
- 
-   return pte;
+   return (pte_t *) page_address(page);
  }
  
  void pte_free_kernel(struct mm_struct *mm, pte_t *pte)


pgp0e2MyQl3Ec.pgp
Description: PGP signature

[PATCH -tip RFC v2 15/22] x86/alternative: Use NOKPROBE_SYMBOL macro in alternative.c

2013-11-14 Thread Masami Hiramatsu

Use NOKPROBE_SYMBOL macro to protect functions from kprobes
instead of __kprobes annotation in alternative.c.

Signed-off-by: Masami Hiramatsu 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
Cc: Jiri Kosina 
Cc: Borislav Petkov 
---
 arch/x86/kernel/alternative.c |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index df94598..7cfd6d7 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -551,7 +551,7 @@ void *__init_or_module text_poke_early(void *addr, const 
void *opcode,
  *
  * Note: Must be called under text_mutex.
  */
-void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
+void *text_poke(void *addr, const void *opcode, size_t len)
 {
unsigned long flags;
char *vaddr;
@@ -585,6 +585,7 @@ void *__kprobes text_poke(void *addr, const void *opcode, 
size_t len)
local_irq_restore(flags);
return addr;
 }
+NOKPROBE_SYMBOL(text_poke);
 
 static void do_sync_core(void *info)
 {

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH -tip RFC v2 14/22] x86/fault: Use NOKPROBE_SYMBOL macro in fault.c

2013-11-14 Thread Masami Hiramatsu

Use NOKPROBE_SYMBOL macro to protect functions from kprobes
instead of __kprobes annotation in fault.c.
This applies the __always_inline annotation in some cases,
because NOKPROBE_SYMBOL() inhibits inlining by
referring to the symbol's address.

Signed-off-by: Masami Hiramatsu 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
Cc: Andrew Morton 
Cc: Michal Hocko 
Cc: Seiji Aguchi 
---
 arch/x86/mm/fault.c |   28 +---
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 9ff85bb..7c9305c 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -8,7 +8,7 @@
 #include   /* oops_begin/end, ...  */
 #include   /* search_exception_table   */
 #include  /* max_low_pfn  */
-#include  /* __kprobes, ...   */
+#include  /* NOKPROBE_SYMBOL, ... */
 #include/* kmmio_handler, ...   */
 #include   /* perf_sw_event*/
 #include  /* hstate_index_to_shift*/
@@ -45,7 +45,7 @@ enum x86_pf_error_code {
  * Returns 0 if mmiotrace is disabled, or if the fault is not
  * handled by mmiotrace:
  */
-static inline int __kprobes
+static __always_inline int
 kmmio_fault(struct pt_regs *regs, unsigned long addr)
 {
if (unlikely(is_kmmio_active()))
@@ -54,7 +54,7 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr)
return 0;
 }
 
-static inline int __kprobes kprobes_fault(struct pt_regs *regs)
+static __always_inline int kprobes_fault(struct pt_regs *regs)
 {
int ret = 0;
 
@@ -261,7 +261,7 @@ void vmalloc_sync_all(void)
  *
  *   Handle a fault on the vmalloc or module mapping area
  */
-static noinline __kprobes int vmalloc_fault(unsigned long address)
+static noinline int vmalloc_fault(unsigned long address)
 {
unsigned long pgd_paddr;
pmd_t *pmd_k;
@@ -291,6 +291,7 @@ static noinline __kprobes int vmalloc_fault(unsigned long 
address)
 
return 0;
 }
+NOKPROBE_SYMBOL(vmalloc_fault);
 
 /*
  * Did it hit the DOS screen memory VA from vm86 mode?
@@ -358,7 +359,7 @@ void vmalloc_sync_all(void)
  *
  * This assumes no large pages in there.
  */
-static noinline __kprobes int vmalloc_fault(unsigned long address)
+static noinline int vmalloc_fault(unsigned long address)
 {
pgd_t *pgd, *pgd_ref;
pud_t *pud, *pud_ref;
@@ -425,6 +426,7 @@ static noinline __kprobes int vmalloc_fault(unsigned long 
address)
 
return 0;
 }
+NOKPROBE_SYMBOL(vmalloc_fault);
 
 #ifdef CONFIG_CPU_SUP_AMD
 static const char errata93_warning[] =
@@ -904,7 +906,7 @@ static int spurious_fault_check(unsigned long error_code, 
pte_t *pte)
  * There are no security implications to leaving a stale TLB when
  * increasing the permissions on a page.
  */
-static noinline __kprobes int
+static noinline int
 spurious_fault(unsigned long error_code, unsigned long address)
 {
pgd_t *pgd;
@@ -952,6 +954,7 @@ spurious_fault(unsigned long error_code, unsigned long 
address)
 
return ret;
 }
+NOKPROBE_SYMBOL(spurious_fault);
 
 int show_unhandled_signals = 1;
 
@@ -997,7 +1000,7 @@ static inline bool smap_violation(int error_code, struct 
pt_regs *regs)
  * and the problem, and then passes it off to one of the appropriate
  * routines.
  */
-static void __kprobes
+static void
 __do_page_fault(struct pt_regs *regs, unsigned long error_code)
 {
struct vm_area_struct *vma;
@@ -1225,8 +1228,9 @@ good_area:
 
up_read(>mmap_sem);
 }
+NOKPROBE_SYMBOL(__do_page_fault);
 
-dotraplinkage void __kprobes
+dotraplinkage void
 do_page_fault(struct pt_regs *regs, unsigned long error_code)
 {
enum ctx_state prev_state;
@@ -1235,9 +1239,10 @@ do_page_fault(struct pt_regs *regs, unsigned long 
error_code)
__do_page_fault(regs, error_code);
exception_exit(prev_state);
 }
+NOKPROBE_SYMBOL(do_page_fault);
 
-static void trace_page_fault_entries(struct pt_regs *regs,
-unsigned long error_code)
+static __always_inline void
+trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code)
 {
if (user_mode(regs))
trace_page_fault_user(read_cr2(), regs, error_code);
@@ -1245,7 +1250,7 @@ static void trace_page_fault_entries(struct pt_regs *regs,
trace_page_fault_kernel(read_cr2(), regs, error_code);
 }
 
-dotraplinkage void __kprobes
+dotraplinkage void
 trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
 {
enum ctx_state prev_state;
@@ -1255,3 +1260,4 @@ trace_do_page_fault(struct pt_regs *regs, unsigned long 
error_code)
__do_page_fault(regs, error_code);
exception_exit(prev_state);
 }
+NOKPROBE_SYMBOL(trace_do_page_fault);

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at

[PATCH -tip RFC v2 20/22] [BUGFIX] kprobes: Prohibit probing on func_ptr_is_kernel_text

2013-11-14 Thread Masami Hiramatsu

Prohibit probing on func_ptr_is_kernel_text() by adding
it to the kprobe_blacklist.

Since func_ptr_is_kernel_text() is called from
notifier_call_chain(), which is called from the int3 handler,
probing it may cause a double int3 fault and the kernel will
reboot.

This happens when the kernel is built with CONFIG_DEBUG_NOTIFIERS=y.

Signed-off-by: Masami Hiramatsu 
Cc: Andrew Morton 
Cc: "Uwe Kleine-König" 
Cc: Borislav Petkov 
Cc: Ingo Molnar 
---
 kernel/extable.c |2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/extable.c b/kernel/extable.c
index 832cb28..885c877 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -137,3 +138,4 @@ int func_ptr_is_kernel_text(void *ptr)
return 1;
return is_module_text_address(addr);
 }
+NOKPROBE_SYMBOL(func_ptr_is_kernel_text);

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH -tip RFC v2 10/22] ftrace/kprobes: Allow probing on some preparation functions

2013-11-14 Thread Masami Hiramatsu

There is no need to prohibit probing on the functions
used for preparation. They can be probed safely because
they are not invoked from breakpoint/fault/debug handlers,
so there is no chance of causing recursive exceptions.

Following functions are now removed from the kprobes blacklist.
update_bitfield_fetch_param
free_bitfield_fetch_param
kprobe_register

Signed-off-by: Masami Hiramatsu 
Cc: Steven Rostedt 
Cc: Frederic Weisbecker 
Cc: Ingo Molnar 
---
 kernel/trace/trace_kprobe.c |2 +-
 kernel/trace/trace_probe.c  |4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 243f683..e0132b4 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1151,7 +1151,7 @@ kretprobe_perf_func(struct trace_probe *tp, struct 
kretprobe_instance *ri,
  * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe
  * lockless, but we can't race with this __init function.
  */
-static __kprobes
+static
 int kprobe_register(struct ftrace_event_call *event,
enum trace_reg type, void *data)
 {
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 412e959..43638a2 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -346,7 +346,7 @@ DEFINE_BASIC_FETCH_FUNCS(bitfield)
 #define fetch_bitfield_string  NULL
 #define fetch_bitfield_string_size NULL
 
-static __kprobes void
+static void
 update_bitfield_fetch_param(struct bitfield_fetch_param *data)
 {
/*
@@ -359,7 +359,7 @@ update_bitfield_fetch_param(struct bitfield_fetch_param 
*data)
update_symbol_cache(data->orig.data);
 }
 
-static __kprobes void
+static void
 free_bitfield_fetch_param(struct bitfield_fetch_param *data)
 {
/*

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH -tip RFC v2 08/22] kprobes: Allow probe on some kprobe functions

2013-11-14 Thread Masami Hiramatsu

There is no need to prohibit probing on the functions
used for preparation, registration, optimization,
control, etc. They can be probed safely because they are
not invoked from breakpoint/fault/debug handlers,
so there is no chance of causing recursive exceptions.

Following functions are now removed from the kprobes blacklist.
add_new_kprobe
aggr_kprobe_disabled
alloc_aggr_kprobe
alloc_aggr_kprobe
arm_all_kprobes
__arm_kprobe
arm_kprobe
arm_kprobe_ftrace
check_kprobe_address_safe
collect_garbage_slots
collect_garbage_slots
collect_one_slot
debugfs_kprobe_init
__disable_kprobe
disable_kprobe
disarm_all_kprobes
__disarm_kprobe
disarm_kprobe
disarm_kprobe_ftrace
do_free_cleaned_kprobes
do_optimize_kprobes
do_unoptimize_kprobes
enable_kprobe
force_unoptimize_kprobe
free_aggr_kprobe
free_aggr_kprobe
__free_insn_slot
__get_insn_slot
get_optimized_kprobe
__get_valid_kprobe
init_aggr_kprobe
init_aggr_kprobe
in_nokprobe_functions
kick_kprobe_optimizer
kill_kprobe
kill_optimized_kprobe
kprobe_addr
kprobe_optimizer
kprobe_queued
kprobe_seq_next
kprobe_seq_start
kprobe_seq_stop
kprobes_module_callback
kprobes_open
optimize_all_kprobes
optimize_kprobe
prepare_kprobe
prepare_optimized_kprobe
register_aggr_kprobe
register_jprobe
register_jprobes
register_kprobe
register_kprobes
register_kretprobe
register_kretprobe
register_kretprobes
register_kretprobes
report_probe
show_kprobe_addr
try_to_optimize_kprobe
unoptimize_all_kprobes
unoptimize_kprobe
unregister_jprobe
unregister_jprobes
unregister_kprobe
__unregister_kprobe_bottom
unregister_kprobes
__unregister_kprobe_top
unregister_kretprobe
unregister_kretprobe
unregister_kretprobes
unregister_kretprobes
wait_for_kprobe_optimizer

Signed-off-by: Masami Hiramatsu 
Cc: Ananth N Mavinakayanahalli 
Cc: "David S. Miller" 
---
 kernel/kprobes.c |  153 +++---
 1 file changed, 76 insertions(+), 77 deletions(-)

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 4e8ce87..9345bc7 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -139,13 +139,13 @@ struct kprobe_insn_cache kprobe_insn_slots = {
.insn_size = MAX_INSN_SIZE,
.nr_garbage = 0,
 };
-static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c);
+static int collect_garbage_slots(struct kprobe_insn_cache *c);
 
 /**
  * __get_insn_slot() - Find a slot on an executable page for an instruction.
  * We allocate an executable page if there's no room on existing ones.
  */
-kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
+kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
 {
struct kprobe_insn_page *kip;
kprobe_opcode_t *slot = NULL;
@@ -202,7 +202,7 @@ out:
 }
 
 /* Return 1 if all garbages are collected, otherwise 0. */
-static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
+static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
 {
kip->slot_used[idx] = SLOT_CLEAN;
kip->nused--;
@@ -223,7 +223,7 @@ static int __kprobes collect_one_slot(struct 
kprobe_insn_page *kip, int idx)
return 0;
 }
 
-static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
+static int collect_garbage_slots(struct kprobe_insn_cache *c)
 {
struct kprobe_insn_page *kip, *next;
 
@@ -245,8 +245,8 @@ static int __kprobes collect_garbage_slots(struct 
kprobe_insn_cache *c)
return 0;
 }
 
-void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
-   kprobe_opcode_t *slot, int dirty)
+void __free_insn_slot(struct kprobe_insn_cache *c,
+ kprobe_opcode_t *slot, int dirty)
 {
struct kprobe_insn_page *kip;
 
@@ -362,7 +362,7 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct 
pt_regs *regs)
 }
 
 /* Free optimized instructions and optimized_kprobe */
-static __kprobes void free_aggr_kprobe(struct kprobe *p)
+static void free_aggr_kprobe(struct kprobe *p)
 {
struct optimized_kprobe *op;
 
@@ -400,7 +400,7 @@ static inline int kprobe_disarmed(struct kprobe *p)
 }
 
 /* Return true(!0) if the probe is queued on (un)optimizing lists */
-static int __kprobes kprobe_queued(struct kprobe *p)
+static int kprobe_queued(struct kprobe *p)
 {
struct optimized_kprobe *op;
 
@@ -416,7 +416,7 @@ static int __kprobes kprobe_queued(struct kprobe *p)
  * Return an optimized kprobe whose optimizing code replaces
  * instructions including addr (exclude breakpoint).
  */
-static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
+static struct kprobe *get_optimized_kprobe(unsigned long addr)
 {
int i;
struct kprobe *p = NULL;
@@ -448,7 +448,7 @@ static DECLARE_DELAYED_WORK(optimizing_work, 
kprobe_optimizer);
  * Optimize (replace a breakpoint with a jump) kprobes listed on
  * optimizing_list.
  */
-static __kprobes void do_optimize_kprobes(void)
+static void do_optimize_kprobes(void)
 {
/* Optimization never be done when disarmed */
if

[PATCH -tip RFC v2 19/22] [BUGFIX] kprobes/x86: Prohibit probing on debug_stack_*

2013-11-14 Thread Masami Hiramatsu

Prohibit probing on debug_stack_reset and debug_stack_set_zero.
Since both functions are called from the TRACE_IRQS_ON/OFF_DEBUG
macros, which run in the int3 IST entry, probing them may cause a soft
lockup.

This happens when the kernel is built with CONFIG_DYNAMIC_FTRACE=y
and CONFIG_TRACE_IRQFLAGS=y.

Signed-off-by: Masami Hiramatsu 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
Cc: Borislav Petkov 
Cc: Fenghua Yu 
Cc: Seiji Aguchi 
---
 arch/x86/kernel/cpu/common.c |4 
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 1789b06..d0a802a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -8,6 +8,7 @@
 #include 
 #include 
 #include 
+#include <linux/kprobes.h>
 #include 
 #include 
 #include 
@@ -1163,6 +1164,7 @@ int is_debug_stack(unsigned long addr)
(addr <= __get_cpu_var(debug_stack_addr) &&
 addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));
 }
+NOKPROBE_SYMBOL(is_debug_stack);
 
 DEFINE_PER_CPU(u32, debug_idt_ctr);
 
@@ -1171,6 +1173,7 @@ void debug_stack_set_zero(void)
this_cpu_inc(debug_idt_ctr);
load_current_idt();
 }
+NOKPROBE_SYMBOL(debug_stack_set_zero);
 
 void debug_stack_reset(void)
 {
@@ -1179,6 +1182,7 @@ void debug_stack_reset(void)
if (this_cpu_dec_return(debug_idt_ctr) == 0)
load_current_idt();
 }
+NOKPROBE_SYMBOL(debug_stack_reset);
 
 #else  /* CONFIG_X86_64 */
 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH -tip RFC v2 07/22] kprobes/x86: Use NOKPROBE_SYMBOL instead of __kprobes

2013-11-14 Thread Masami Hiramatsu

Use NOKPROBE_SYMBOL macro for protecting functions
from kprobes instead of __kprobes annotation in x86
kprobes code.
This applies the __always_inline annotation in some cases,
because NOKPROBE_SYMBOL() will inhibit inlining by
referring to the symbol address.

Signed-off-by: Masami Hiramatsu 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
Cc: Steven Rostedt 
Cc: Andrew Morton 
---
 arch/x86/kernel/kprobes/core.c   |   77 --
 arch/x86/kernel/kprobes/ftrace.c |   15 ---
 arch/x86/kernel/kprobes/opt.c|8 ++--
 3 files changed, 63 insertions(+), 37 deletions(-)

diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 7ccb3d3..6cfa5d3 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -112,7 +112,8 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {
 
 const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
 
-static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
+static __always_inline
+void __synthesize_relative_insn(void *from, void *to, u8 op)
 {
struct __arch_relative_insn {
u8 op;
@@ -125,21 +126,23 @@ static void __kprobes __synthesize_relative_insn(void 
*from, void *to, u8 op)
 }
 
 /* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
-void __kprobes synthesize_reljump(void *from, void *to)
+void synthesize_reljump(void *from, void *to)
 {
__synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
 }
+NOKPROBE_SYMBOL(synthesize_reljump);
 
 /* Insert a call instruction at address 'from', which calls address 'to'.*/
-void __kprobes synthesize_relcall(void *from, void *to)
+void synthesize_relcall(void *from, void *to)
 {
__synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
 }
+NOKPROBE_SYMBOL(synthesize_relcall);
 
 /*
  * Skip the prefixes of the instruction.
  */
-static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn)
+static kprobe_opcode_t *skip_prefixes(kprobe_opcode_t *insn)
 {
insn_attr_t attr;
 
@@ -154,6 +157,7 @@ static kprobe_opcode_t *__kprobes 
skip_prefixes(kprobe_opcode_t *insn)
 #endif
return insn;
 }
+NOKPROBE_SYMBOL(skip_prefixes);
 
 /*
  * Returns non-zero if opcode is boostable.
@@ -425,7 +429,8 @@ void arch_remove_kprobe(struct kprobe *p)
}
 }
 
-static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
+static __always_inline
+void save_previous_kprobe(struct kprobe_ctlblk *kcb)
 {
kcb->prev_kprobe.kp = kprobe_running();
kcb->prev_kprobe.status = kcb->kprobe_status;
@@ -433,7 +438,8 @@ static void __kprobes save_previous_kprobe(struct 
kprobe_ctlblk *kcb)
kcb->prev_kprobe.saved_flags = kcb->kprobe_saved_flags;
 }
 
-static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
+static __always_inline
+void restore_previous_kprobe(struct kprobe_ctlblk *kcb)
 {
__this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
kcb->kprobe_status = kcb->prev_kprobe.status;
@@ -441,8 +447,9 @@ static void __kprobes restore_previous_kprobe(struct 
kprobe_ctlblk *kcb)
kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags;
 }
 
-static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs 
*regs,
-   struct kprobe_ctlblk *kcb)
+static __always_inline
+void set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
+   struct kprobe_ctlblk *kcb)
 {
__this_cpu_write(current_kprobe, p);
kcb->kprobe_saved_flags = kcb->kprobe_old_flags
@@ -451,7 +458,7 @@ static void __kprobes set_current_kprobe(struct kprobe *p, 
struct pt_regs *regs,
kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF;
 }
 
-static void __kprobes clear_btf(void)
+static __always_inline void clear_btf(void)
 {
if (test_thread_flag(TIF_BLOCKSTEP)) {
unsigned long debugctl = get_debugctlmsr();
@@ -461,7 +468,7 @@ static void __kprobes clear_btf(void)
}
 }
 
-static void __kprobes restore_btf(void)
+static __always_inline void restore_btf(void)
 {
if (test_thread_flag(TIF_BLOCKSTEP)) {
unsigned long debugctl = get_debugctlmsr();
@@ -471,8 +478,7 @@ static void __kprobes restore_btf(void)
}
 }
 
-void __kprobes
-arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
+void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs 
*regs)
 {
unsigned long *sara = stack_addr(regs);
 
@@ -481,9 +487,10 @@ arch_prepare_kretprobe(struct kretprobe_instance *ri, 
struct pt_regs *regs)
/* Replace the return addr with trampoline addr */
*sara = (unsigned long) _trampoline;
 }
+NOKPROBE_SYMBOL(arch_prepare_kretprobe);
 
-static void __kprobes
-setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk 
*kcb, int reenter)
+static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
+

[PATCH -tip RFC v2 03/22] kprobes: Show blacklist entries via debugfs

2013-11-14 Thread Masami Hiramatsu

Show blacklist entries (function names with the address
range) via /sys/kernel/debug/kprobes/blacklist.

Signed-off-by: Masami Hiramatsu 
Cc: Ananth N Mavinakayanahalli 
Cc: "David S. Miller" 
---
 kernel/kprobes.c |   61 +++---
 1 file changed, 53 insertions(+), 8 deletions(-)

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index a3b323e..007235e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2224,6 +2224,46 @@ static const struct file_operations 
debugfs_kprobes_operations = {
.release= seq_release,
 };
 
+/* kprobes/blacklist -- shows which functions can not be probed */
+static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos)
+{
+   return seq_list_start(_blacklist, *pos);
+}
+
+static void *kprobe_blacklist_seq_next(struct seq_file *m, void *v, loff_t 
*pos)
+{
+   return seq_list_next(v, _blacklist, pos);
+}
+
+static int kprobe_blacklist_seq_show(struct seq_file *m, void *v)
+{
+   struct kprobe_blackpoint *bp =
+   list_entry(v, struct kprobe_blackpoint, list);
+
+   seq_printf(m, "0x%p-0x%p\t%s\n", (void *)bp->start_addr,
+  (void *)(bp->start_addr + bp->range), bp->name);
+   return 0;
+}
+
+static const struct seq_operations kprobe_blacklist_seq_ops = {
+   .start = kprobe_blacklist_seq_start,
+   .next  = kprobe_blacklist_seq_next,
+   .stop  = kprobe_seq_stop,   /* Reuse void function */
+   .show  = kprobe_blacklist_seq_show,
+};
+
+static int kprobe_blacklist_open(struct inode *inode, struct file *filp)
+{
+   return seq_open(filp, &kprobe_blacklist_seq_ops);
+}
+
+static const struct file_operations debugfs_kprobe_blacklist_ops = {
+   .open   = kprobe_blacklist_open,
+   .read   = seq_read,
+   .llseek = seq_lseek,
+   .release= seq_release,
+};
+
 static void __kprobes arm_all_kprobes(void)
 {
struct hlist_head *head;
@@ -2347,19 +2387,24 @@ static int __kprobes debugfs_kprobe_init(void)
 
file = debugfs_create_file("list", 0444, dir, NULL,
&debugfs_kprobes_operations);
-   if (!file) {
-   debugfs_remove(dir);
-   return -ENOMEM;
-   }
+   if (!file)
+   goto error;
 
file = debugfs_create_file("enabled", 0600, dir,
, _kp);
-   if (!file) {
-   debugfs_remove(dir);
-   return -ENOMEM;
-   }
+   if (!file)
+   goto error;
+
+   file = debugfs_create_file("blacklist", 0444, dir, NULL,
+   &debugfs_kprobe_blacklist_ops);
+   if (!file)
+   goto error;
 
return 0;
+
+error:
+   debugfs_remove(dir);
+   return -ENOMEM;
 }
 
 late_initcall(debugfs_kprobe_init);

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH -tip RFC v2 21/22] notifier: Use NOKPROBE_SYMBOL macro in notifier

2013-11-14 Thread Masami Hiramatsu

Use NOKPROBE_SYMBOL macro to protect functions from
kprobes instead of __kprobes annotation in notifier.

Signed-off-by: Masami Hiramatsu 
---
 kernel/notifier.c |   22 +-
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/kernel/notifier.c b/kernel/notifier.c
index 2d5cc4c..61fc78a 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -71,9 +71,9 @@ static int notifier_chain_unregister(struct notifier_block 
**nl,
  * @returns:   notifier_call_chain returns the value returned by the
  * last notifier function called.
  */
-static int __kprobes notifier_call_chain(struct notifier_block **nl,
-   unsigned long val, void *v,
-   int nr_to_call, int *nr_calls)
+static int notifier_call_chain(struct notifier_block **nl,
+  unsigned long val, void *v,
+  int nr_to_call, int *nr_calls)
 {
int ret = NOTIFY_DONE;
struct notifier_block *nb, *next_nb;
@@ -102,6 +102,7 @@ static int __kprobes notifier_call_chain(struct 
notifier_block **nl,
}
return ret;
 }
+NOKPROBE_SYMBOL(notifier_call_chain);
 
 /*
  * Atomic notifier chain routines.  Registration and unregistration
@@ -172,9 +173,9 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
  * Otherwise the return value is the return value
  * of the last notifier function called.
  */
-int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
-   unsigned long val, void *v,
-   int nr_to_call, int *nr_calls)
+int __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+unsigned long val, void *v,
+int nr_to_call, int *nr_calls)
 {
int ret;
 
@@ -184,13 +185,15 @@ int __kprobes __atomic_notifier_call_chain(struct 
atomic_notifier_head *nh,
return ret;
 }
 EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
+NOKPROBE_SYMBOL(__atomic_notifier_call_chain);
 
-int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh,
-   unsigned long val, void *v)
+int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+  unsigned long val, void *v)
 {
return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
 }
 EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
+NOKPROBE_SYMBOL(atomic_notifier_call_chain);
 
 /*
  * Blocking notifier chain routines.  All access to the chain is
@@ -527,7 +530,7 @@ EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
 
 static ATOMIC_NOTIFIER_HEAD(die_chain);
 
-int notrace __kprobes notify_die(enum die_val val, const char *str,
+int notrace notify_die(enum die_val val, const char *str,
   struct pt_regs *regs, long err, int trap, int sig)
 {
struct die_args args = {
@@ -540,6 +543,7 @@ int notrace __kprobes notify_die(enum die_val val, const 
char *str,
};
return atomic_notifier_call_chain(&die_chain, val, &args);
 }
+NOKPROBE_SYMBOL(notify_die);
 
 int register_die_notifier(struct notifier_block *nb)
 {

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH -tip RFC v2 11/22] ftrace/kprobes: Use NOKPROBE_SYMBOL macro in ftrace

2013-11-14 Thread Masami Hiramatsu

Use NOKPROBE_SYMBOL macro to protect functions from
kprobes instead of __kprobes annotation in ftrace.
This applies the __always_inline annotation in some cases,
because NOKPROBE_SYMBOL() will inhibit inlining by
referring to the symbol address.

Signed-off-by: Masami Hiramatsu 
Cc: Steven Rostedt 
Cc: Frederic Weisbecker 
Cc: Ingo Molnar 
---
 kernel/trace/trace_event_perf.c |5 ++-
 kernel/trace/trace_kprobe.c |   51 +++
 kernel/trace/trace_probe.c  |   74 +++
 kernel/trace/trace_probe.h  |4 +-
 4 files changed, 76 insertions(+), 58 deletions(-)

diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 78e27e3..25d8903 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -226,8 +226,8 @@ void perf_trace_del(struct perf_event *p_event, int flags)
tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
 }
 
-__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
-  struct pt_regs *regs, int *rctxp)
+void *perf_trace_buf_prepare(int size, unsigned short type,
+struct pt_regs *regs, int *rctxp)
 {
struct trace_entry *entry;
unsigned long flags;
@@ -259,6 +259,7 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned 
short type,
return raw_data;
 }
 EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
+NOKPROBE_SYMBOL(perf_trace_buf_prepare);
 
 #ifdef CONFIG_FUNCTION_TRACER
 static void
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index e0132b4..2f19ea6 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -51,45 +51,45 @@ struct event_file_link {
(sizeof(struct probe_arg) * (n)))
 
 
-static __kprobes bool trace_probe_is_return(struct trace_probe *tp)
+static __always_inline bool trace_probe_is_return(struct trace_probe *tp)
 {
return tp->rp.handler != NULL;
 }
 
-static __kprobes const char *trace_probe_symbol(struct trace_probe *tp)
+static __always_inline const char *trace_probe_symbol(struct trace_probe *tp)
 {
return tp->symbol ? tp->symbol : "unknown";
 }
 
-static __kprobes unsigned long trace_probe_offset(struct trace_probe *tp)
+static __always_inline unsigned long trace_probe_offset(struct trace_probe *tp)
 {
return tp->rp.kp.offset;
 }
 
-static __kprobes bool trace_probe_is_enabled(struct trace_probe *tp)
+static __always_inline bool trace_probe_is_enabled(struct trace_probe *tp)
 {
return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE));
 }
 
-static __kprobes bool trace_probe_is_registered(struct trace_probe *tp)
+static __always_inline bool trace_probe_is_registered(struct trace_probe *tp)
 {
return !!(tp->flags & TP_FLAG_REGISTERED);
 }
 
-static __kprobes bool trace_probe_has_gone(struct trace_probe *tp)
+static __always_inline bool trace_probe_has_gone(struct trace_probe *tp)
 {
return !!(kprobe_gone(&tp->rp.kp));
 }
 
-static __kprobes bool trace_probe_within_module(struct trace_probe *tp,
-   struct module *mod)
+static __always_inline bool trace_probe_within_module(struct trace_probe *tp,
+ struct module *mod)
 {
int len = strlen(mod->name);
const char *name = trace_probe_symbol(tp);
return strncmp(mod->name, name, len) == 0 && name[len] == ':';
 }
 
-static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp)
+static __always_inline bool trace_probe_is_on_module(struct trace_probe *tp)
 {
return !!strchr(trace_probe_symbol(tp), ':');
 }
@@ -755,8 +755,8 @@ static const struct file_operations kprobe_profile_ops = {
 };
 
 /* Sum up total data length for dynamic arraies (strings) */
-static __kprobes int __get_data_size(struct trace_probe *tp,
-struct pt_regs *regs)
+static __always_inline
+int __get_data_size(struct trace_probe *tp, struct pt_regs *regs)
 {
int i, ret = 0;
u32 len;
@@ -771,9 +771,9 @@ static __kprobes int __get_data_size(struct trace_probe *tp,
 }
 
 /* Store the value of each argument */
-static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp,
-  struct pt_regs *regs,
-  u8 *data, int maxlen)
+static __always_inline
+void store_trace_args(int ent_size, struct trace_probe *tp,
+ struct pt_regs *regs, u8 *data, int maxlen)
 {
int i;
u32 end = tp->size;
@@ -803,7 +803,7 @@ static __kprobes void store_trace_args(int ent_size, struct 
trace_probe *tp,
 }
 
 /* Kprobe handler */
-static __kprobes void
+static __always_inline void
 __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs,
struct ftrace_event_file *ftrace_file)
 {
@@ -840,7 +840,7 @@ __kprobe_trace_func(struct trace_probe

[PATCH -tip RFC v2 17/22] x86/kvm: Use NOKPROBE_SYMBOL macro in kvm.c

2013-11-14 Thread Masami Hiramatsu

Use NOKPROBE_SYMBOL macro for protecting functions
from kprobes instead of __kprobes annotation in kvm.c.
This also adds kvm_read_and_reset_pf_reason to
the blacklist because it can be called before
do_page_fault.

Signed-off-by: Masami Hiramatsu 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
Cc: Gleb Natapov 
Cc: Raghavendra K T 
Cc: Marcelo Tosatti 
---
 arch/x86/kernel/kvm.c |4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 6dd802c..fb95987 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -251,8 +251,9 @@ u32 kvm_read_and_reset_pf_reason(void)
return reason;
 }
 EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
+NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);
 
-dotraplinkage void __kprobes
+dotraplinkage void
 do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
 {
enum ctx_state prev_state;
@@ -276,6 +277,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long 
error_code)
break;
}
 }
+NOKPROBE_SYMBOL(do_async_page_fault);
 
 static void __init paravirt_ops_setup(void)
 {

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH -tip RFC v2 13/22] x86/trap: Use NOKPROBE_SYMBOL macro in trap.c

2013-11-14 Thread Masami Hiramatsu

Use NOKPROBE_SYMBOL macro to protect functions from kprobes
instead of __kprobes annotation in trap.c.
This also applies the __always_inline annotation in some cases,
because NOKPROBE_SYMBOL() will inhibit inlining by referring
to the symbol address.

Signed-off-by: Masami Hiramatsu 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
Cc: Andi Kleen 
Cc: Seiji Aguchi 
Cc: Frederic Weisbecker 
---
 arch/x86/include/asm/traps.h |2 +-
 arch/x86/kernel/traps.c  |   20 +---
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 58d66fe..ca32508 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -68,7 +68,7 @@ dotraplinkage void do_segment_not_present(struct pt_regs *, 
long);
 dotraplinkage void do_stack_segment(struct pt_regs *, long);
 #ifdef CONFIG_X86_64
 dotraplinkage void do_double_fault(struct pt_regs *, long);
-asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *);
+asmlinkage struct pt_regs *sync_regs(struct pt_regs *);
 #endif
 dotraplinkage void do_general_protection(struct pt_regs *, long);
 dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ce24c24..e751e3b 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -106,7 +106,7 @@ static inline void preempt_conditional_cli(struct pt_regs 
*regs)
preempt_count_dec();
 }
 
-static int __kprobes
+static __always_inline int
 do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
  struct pt_regs *regs, long error_code)
 {
@@ -136,7 +136,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char 
*str,
return -1;
 }
 
-static void __kprobes
+static void
 do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
long error_code, siginfo_t *info)
 {
@@ -173,6 +173,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs 
*regs,
else
force_sig(signr, tsk);
 }
+NOKPROBE_SYMBOL(do_trap);
 
 #define DO_ERROR(trapnr, signr, str, name) \
 dotraplinkage void do_##name(struct pt_regs *regs, long error_code)\
@@ -267,7 +268,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, 
long error_code)
 }
 #endif
 
-dotraplinkage void __kprobes
+dotraplinkage void
 do_general_protection(struct pt_regs *regs, long error_code)
 {
struct task_struct *tsk;
@@ -313,9 +314,10 @@ do_general_protection(struct pt_regs *regs, long 
error_code)
 exit:
exception_exit(prev_state);
 }
+NOKPROBE_SYMBOL(do_general_protection);
 
 /* May run on IST stack. */
-dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long 
error_code)
+dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
 {
enum ctx_state prev_state;
 
@@ -354,6 +356,7 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs 
*regs, long error_co
 exit:
exception_exit(prev_state);
 }
+NOKPROBE_SYMBOL(do_int3);
 
 #ifdef CONFIG_X86_64
 /*
@@ -361,7 +364,7 @@ exit:
  * for scheduling or signal handling. The actual stack switch is done in
  * entry.S
  */
-asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
+asmlinkage struct pt_regs *sync_regs(struct pt_regs *eregs)
 {
struct pt_regs *regs = eregs;
/* Did already sync */
@@ -380,6 +383,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct 
pt_regs *eregs)
*regs = *eregs;
return regs;
 }
+NOKPROBE_SYMBOL(sync_regs);
 #endif
 
 /*
@@ -406,7 +410,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct 
pt_regs *eregs)
  *
  * May run on IST stack.
  */
-dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
+dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
 {
struct task_struct *tsk = current;
enum ctx_state prev_state;
@@ -486,6 +490,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, 
long error_code)
 exit:
exception_exit(prev_state);
 }
+NOKPROBE_SYMBOL(do_debug);
 
 /*
  * Note that we play around with the 'TS' bit in an attempt to get
@@ -657,7 +662,7 @@ void math_state_restore(void)
 }
 EXPORT_SYMBOL_GPL(math_state_restore);
 
-dotraplinkage void __kprobes
+dotraplinkage void
 do_device_not_available(struct pt_regs *regs, long error_code)
 {
enum ctx_state prev_state;
@@ -683,6 +688,7 @@ do_device_not_available(struct pt_regs *regs, long 
error_code)
 #endif
exception_exit(prev_state);
 }
+NOKPROBE_SYMBOL(do_device_not_available);
 
 #ifdef CONFIG_X86_32
 dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH -tip RFC v2 12/22] x86/hw_breakpoint: Use NOKPROBE_SYMBOL macro in hw_breakpoint

2013-11-14 Thread Masami Hiramatsu

Use NOKPROBE_SYMBOL macro to protect functions from kprobes
instead of __kprobes annotation in hw_breakpoint.

Signed-off-by: Masami Hiramatsu 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
Cc: Andrew Morton 
Cc: Oleg Nesterov 
---
 arch/x86/kernel/hw_breakpoint.c |6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index f66ff16..cb4df84 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -425,7 +425,7 @@ EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
  * NOTIFY_STOP returned for all other cases
  *
  */
-static int __kprobes hw_breakpoint_handler(struct die_args *args)
+static int hw_breakpoint_handler(struct die_args *args)
 {
int i, cpu, rc = NOTIFY_STOP;
struct perf_event *bp;
@@ -508,11 +508,12 @@ static int __kprobes hw_breakpoint_handler(struct 
die_args *args)
 
return rc;
 }
+NOKPROBE_SYMBOL(hw_breakpoint_handler);
 
 /*
  * Handle debug exception notifications.
  */
-int __kprobes hw_breakpoint_exceptions_notify(
+int hw_breakpoint_exceptions_notify(
struct notifier_block *unused, unsigned long val, void *data)
 {
if (val != DIE_DEBUG)
@@ -520,6 +521,7 @@ int __kprobes hw_breakpoint_exceptions_notify(
 
return hw_breakpoint_handler(data);
 }
+NOKPROBE_SYMBOL(hw_breakpoint_exceptions_notify);
 
 void hw_breakpoint_pmu_read(struct perf_event *bp)
 {

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH -tip RFC v2 02/22] kprobes: Introduce NOKPROBE_SYMBOL() macro for blacklist

2013-11-14 Thread Masami Hiramatsu

Introduce the NOKPROBE_SYMBOL() macro, which builds a kprobe
blacklist at build time. The usage of this macro is similar
to EXPORT_SYMBOL: put NOKPROBE_SYMBOL(function); just
after the function definition.

If CONFIG_KPROBES=y, the macro expands to the definition
of a static kprobe_blackpoint data structure which is
initialized for the function, and puts the address of the data
structure in the "_kprobe_blacklist" section.

Since the data structures are not fully initialized by the
macro (because there is no "size" information), they
are re-initialized at boot time using kallsyms.

Changes from previous version:
 - fix indent of the macro by using tabs.
 - fix macro for expanding nested macro.
 - update Documentation/kprobes.txt

Signed-off-by: Masami Hiramatsu 
Cc: Ananth N Mavinakayanahalli 
Cc: "David S. Miller" 
Cc: Rob Landley 
Cc: Jeremy Fitzhardinge 
Cc: Chris Wright 
Cc: Alok Kataria 
Cc: Rusty Russell 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
Cc: Arnd Bergmann 
Cc: Peter Zijlstra 
---
 Documentation/kprobes.txt |   16 ++-
 arch/x86/kernel/paravirt.c|4 ++
 include/asm-generic/vmlinux.lds.h |9 
 include/linux/kprobes.h   |   20 
 kernel/kprobes.c  |   88 ++---
 kernel/sched/core.c   |1 
 6 files changed, 91 insertions(+), 47 deletions(-)

diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt
index 0cfb00f..7062631 100644
--- a/Documentation/kprobes.txt
+++ b/Documentation/kprobes.txt
@@ -22,8 +22,9 @@ Appendix B: The kprobes sysctl interface
 
 Kprobes enables you to dynamically break into any kernel routine and
 collect debugging and performance information non-disruptively. You
-can trap at almost any kernel code address, specifying a handler
+can trap at almost any kernel code address(*), specifying a handler
 routine to be invoked when the breakpoint is hit.
+(*: some parts of the kernel code cannot be trapped, see 1.5 Blacklist)
 
 There are currently three types of probes: kprobes, jprobes, and
 kretprobes (also called return probes).  A kprobe can be inserted
@@ -273,6 +274,19 @@ using one of the following techniques:
  or
 - Execute 'sysctl -w debug.kprobes_optimization=n'
 
+1.5 Blacklist
+
+Kprobes can probe most of the kernel except itself. This means
+that there are some functions which kprobes cannot probe. Probing
+(trapping) such functions can cause a recursive trap (e.g. double
+fault), or at least the nested probe handler may never be called.
+Kprobes manages such functions as a blacklist.
+If you want to add a function into the blacklist, you just need
+to (1) include linux/kprobes.h and (2) use NOKPROBE_SYMBOL() macro
+to specify a blacklisted function.
+Kprobes checks the given probe address against the blacklist and rejects
+registration if the given address is in the blacklist.
+
 2. Architectures Supported
 
 Kprobes, jprobes, and return probes are implemented on the following
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1b10af8..4c785fd 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include <linux/kprobes.h>
 
 #include 
 #include 
@@ -389,6 +390,9 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
.end_context_switch = paravirt_nop,
 };
 
+/* At this point, native_get_debugreg has real function entry */
+NOKPROBE_SYMBOL(native_get_debugreg);
+
 struct pv_apic_ops pv_apic_ops = {
 #ifdef CONFIG_X86_LOCAL_APIC
.startup_ipi_hook = paravirt_nop,
diff --git a/include/asm-generic/vmlinux.lds.h 
b/include/asm-generic/vmlinux.lds.h
index 83e2c31..294ea96 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -109,6 +109,14 @@
 #define BRANCH_PROFILE()
 #endif
 
+#ifdef CONFIG_KPROBES
+#define KPROBE_BLACKLIST() VMLINUX_SYMBOL(__start_kprobe_blacklist) = .; \
+   *(_kprobe_blacklist)  \
+   VMLINUX_SYMBOL(__stop_kprobe_blacklist) = .;
+#else
+#define KPROBE_BLACKLIST()
+#endif
+
 #ifdef CONFIG_EVENT_TRACING
 #define FTRACE_EVENTS(). = ALIGN(8);   
\
VMLINUX_SYMBOL(__start_ftrace_events) = .;  \
@@ -487,6 +495,7 @@
*(.init.rodata) \
FTRACE_EVENTS() \
TRACE_SYSCALLS()\
+   KPROBE_BLACKLIST()  \
MEM_DISCARD(init.rodata)\
CLK_OF_TABLES() \
CLKSRC_OF_TABLES()  \
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 925eaf2..404cfca 100644
--- a/include/linux/kprobes.h
+++

[PATCH -tip RFC v2 09/22] kprobes: Use NOKPROBE_SYMBOL macro instead of __kprobes

2013-11-14 Thread Masami Hiramatsu

Use NOKPROBE_SYMBOL macro to protect functions from
kprobes instead of __kprobes annotation.

Signed-off-by: Masami Hiramatsu 
Cc: Ananth N Mavinakayanahalli 
Cc: "David S. Miller" 
---
 kernel/kprobes.c |   67 +-
 1 file changed, 41 insertions(+), 26 deletions(-)

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9345bc7..87942f4 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -302,7 +302,7 @@ static inline void reset_kprobe_instance(void)
  * OR
  * - with preemption disabled - from arch/xxx/kernel/kprobes.c
  */
-struct kprobe __kprobes *get_kprobe(void *addr)
+struct kprobe *get_kprobe(void *addr)
 {
struct hlist_head *head;
struct kprobe *p;
@@ -315,8 +315,9 @@ struct kprobe __kprobes *get_kprobe(void *addr)
 
return NULL;
 }
+NOKPROBE_SYMBOL(get_kprobe);
 
-static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
+static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
 
 /* Return true if the kprobe is an aggregator */
 static inline int kprobe_aggrprobe(struct kprobe *p)
@@ -348,7 +349,7 @@ static bool kprobes_allow_optimization;
  * Call all pre_handler on the list, but ignores its return value.
  * This must be called from arch-dep optimized caller.
  */
-void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
+void opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
 {
struct kprobe *kp;
 
@@ -360,6 +361,7 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct 
pt_regs *regs)
reset_kprobe_instance();
}
 }
+NOKPROBE_SYMBOL(opt_pre_handler);
 
 /* Free optimized instructions and optimized_kprobe */
 static void free_aggr_kprobe(struct kprobe *p)
@@ -996,7 +998,7 @@ static void disarm_kprobe(struct kprobe *kp, bool reopt)
  * Aggregate handlers for multiple kprobes support - these handlers
  * take care of invoking the individual kprobe handlers on p->list
  */
-static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
+static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
 {
struct kprobe *kp;
 
@@ -1010,9 +1012,10 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, 
struct pt_regs *regs)
}
return 0;
 }
+NOKPROBE_SYMBOL(aggr_pre_handler);
 
-static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
-   unsigned long flags)
+static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
+ unsigned long flags)
 {
struct kprobe *kp;
 
@@ -1024,9 +1027,10 @@ static void __kprobes aggr_post_handler(struct kprobe 
*p, struct pt_regs *regs,
}
}
 }
+NOKPROBE_SYMBOL(aggr_post_handler);
 
-static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
-   int trapnr)
+static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
+ int trapnr)
 {
struct kprobe *cur = __this_cpu_read(kprobe_instance);
 
@@ -1040,8 +1044,9 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, 
struct pt_regs *regs,
}
return 0;
 }
+NOKPROBE_SYMBOL(aggr_fault_handler);
 
-static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
+static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
 {
struct kprobe *cur = __this_cpu_read(kprobe_instance);
int ret = 0;
@@ -1053,9 +1058,10 @@ static int __kprobes aggr_break_handler(struct kprobe 
*p, struct pt_regs *regs)
reset_kprobe_instance();
return ret;
 }
+NOKPROBE_SYMBOL(aggr_break_handler);
 
 /* Walks the list and increments nmissed count for multiprobe case */
-void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
+void kprobes_inc_nmissed_count(struct kprobe *p)
 {
struct kprobe *kp;
if (!kprobe_aggrprobe(p)) {
@@ -1066,9 +1072,10 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe 
*p)
}
return;
 }
+NOKPROBE_SYMBOL(kprobes_inc_nmissed_count);
 
-void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
-   struct hlist_head *head)
+void recycle_rp_inst(struct kretprobe_instance *ri,
+struct hlist_head *head)
 {
struct kretprobe *rp = ri->rp;
 
@@ -1083,8 +1090,9 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance 
*ri,
/* Unregistering */
hlist_add_head(&ri->hlist, head);
 }
+NOKPROBE_SYMBOL(recycle_rp_inst);
 
-void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
+void kretprobe_hash_lock(struct task_struct *tsk,
 struct hlist_head **head, unsigned long *flags)
 __acquires(hlist_lock)
 {
@@ -1095,17 +1103,19 @@ __acquires(hlist_lock)
hlist_lock = kretprobe_table_lock_ptr(hash);
raw_spin_lock_irqsave(hlist_lock, *flags);

[PATCH -tip RFC v2 01/22] kprobes: Prohibit probing on .entry.text code

2013-11-14 Thread Masami Hiramatsu

.entry.text is a code area used for interrupt/syscall entries, and it
contains a lot of sensitive code.
Thus, it is better to prohibit probing on all of that code
instead of only a part of it.
Since some of its symbols are already registered on the kprobe blacklist,
this also removes them from the blacklist.

Signed-off-by: Masami Hiramatsu 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
Cc: Ananth N Mavinakayanahalli 
Cc: Al Viro 
Cc: Seiji Aguchi 
Cc: Peter Zijlstra 
Cc: Frederic Weisbecker 
---
 arch/x86/kernel/entry_32.S |   33 -
 arch/x86/kernel/entry_64.S |   20 
 kernel/kprobes.c   |   10 +-
 3 files changed, 5 insertions(+), 58 deletions(-)

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 51e2988..02c2fef 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -315,10 +315,6 @@ ENTRY(ret_from_kernel_thread)
 ENDPROC(ret_from_kernel_thread)
 
 /*
- * Interrupt exit functions should be protected against kprobes
- */
-   .pushsection .kprobes.text, "ax"
-/*
  * Return to user mode is not as complex as all this looks,
  * but we want the default path for a system call return to
  * go as quickly as possible which is why some of this is
@@ -372,10 +368,6 @@ need_resched:
 END(resume_kernel)
 #endif
CFI_ENDPROC
-/*
- * End of kprobes section
- */
-   .popsection
 
 /* SYSENTER_RETURN points to after the "sysenter" instruction in
the vsyscall page.  See vsyscall-sysentry.S, which defines the symbol.  */
@@ -495,10 +487,6 @@ sysexit_audit:
PTGS_TO_GS_EX
 ENDPROC(ia32_sysenter_target)
 
-/*
- * syscall stub including irq exit should be protected against kprobes
- */
-   .pushsection .kprobes.text, "ax"
# system call handler stub
 ENTRY(system_call)
RING0_INT_FRAME # can't unwind into user space anyway
@@ -691,10 +679,6 @@ syscall_badsys:
jmp resume_userspace
 END(syscall_badsys)
CFI_ENDPROC
-/*
- * End of kprobes section
- */
-   .popsection
 
 .macro FIXUP_ESPFIX_STACK
 /*
@@ -781,10 +765,6 @@ common_interrupt:
 ENDPROC(common_interrupt)
CFI_ENDPROC
 
-/*
- *  Irq entries should be protected against kprobes
- */
-   .pushsection .kprobes.text, "ax"
 #define BUILD_INTERRUPT3(name, nr, fn) \
 ENTRY(name)\
RING0_INT_FRAME;\
@@ -961,10 +941,6 @@ ENTRY(spurious_interrupt_bug)
jmp error_code
CFI_ENDPROC
 END(spurious_interrupt_bug)
-/*
- * End of kprobes section
- */
-   .popsection
 
 #ifdef CONFIG_XEN
 /* Xen doesn't set %esp to be precisely what the normal sysenter
@@ -1239,11 +1215,6 @@ return_to_handler:
jmp *%ecx
 #endif
 
-/*
- * Some functions should be protected against kprobes
- */
-   .pushsection .kprobes.text, "ax"
-
 #ifdef CONFIG_TRACING
 ENTRY(trace_page_fault)
RING0_EC_FRAME
@@ -1453,7 +1424,3 @@ ENTRY(async_page_fault)
 END(async_page_fault)
 #endif
 
-/*
- * End of kprobes section
- */
-   .popsection
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e21b078..c48f8f9 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -487,8 +487,6 @@ ENDPROC(native_usergs_sysret64)
TRACE_IRQS_OFF
.endm
 
-/* save complete stack frame */
-   .pushsection .kprobes.text, "ax"
 ENTRY(save_paranoid)
XCPT_FRAME 1 RDI+8
cld
@@ -517,7 +515,6 @@ ENTRY(save_paranoid)
 1: ret
CFI_ENDPROC
 END(save_paranoid)
-   .popsection
 
 /*
  * A newly forked process directly context switches into this address.
@@ -975,10 +972,6 @@ END(interrupt)
call \func
.endm
 
-/*
- * Interrupt entry/exit should be protected against kprobes
- */
-   .pushsection .kprobes.text, "ax"
/*
 * The interrupt stubs push (~vector+0x80) onto the stack and
 * then jump to common_interrupt.
@@ -1113,10 +1106,6 @@ ENTRY(retint_kernel)
 
CFI_ENDPROC
 END(common_interrupt)
-/*
- * End of kprobes section
- */
-   .popsection
 
 /*
  * APIC interrupts.
@@ -1477,11 +1466,6 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
hyperv_callback_vector hyperv_vector_handler
 #endif /* CONFIG_HYPERV */
 
-/*
- * Some functions should be protected against kprobes
- */
-   .pushsection .kprobes.text, "ax"
-
 paranoidzeroentry_ist debug do_debug DEBUG_STACK
 paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
 paranoiderrorentry stack_segment do_stack_segment
@@ -1898,7 +1882,3 @@ ENTRY(ignore_sysret)
CFI_ENDPROC
 END(ignore_sysret)
 
-/*
- * End of kprobes section
- */
-   .popsection
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index a0d367a..ec0dbc7 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -96,9 +96,6 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long 
hash)
 static struct kprobe_blackpoint kprobe_blacklist[] = {
{"preempt_schedule",},

[PATCH -tip RFC v2 00/22] kprobes: introduce NOKPROBE_SYMBOL() and general cleaning of kprobe blacklist

2013-11-14 Thread Masami Hiramatsu

Currently the blacklist is maintained by hand in kprobes.c,
which is separate from the function definitions and makes it hard
to keep up with kernel updates.

To solve this issue, I've tried to implement a new
NOKPROBE_SYMBOL() macro for building the kprobe blacklist at
build time. Since the NOKPROBE_SYMBOL() macro can be placed
right after the function definition, it is easy to maintain.

This series replaces __kprobes with the NOKPROBE_SYMBOL() macro,
or applies the __always_inline annotation in some cases, because
NOKPROBE_SYMBOL() inhibits inlining by referring to the
symbol address. :(

In this series, I replaced all __kprobes under kernel/ and
arch/x86. As future work, I'd like to replace the
__kprobes annotations for all other archs too. For now, this is just
for reviewing the impact of the cleanup.

Also, I decided to sort out the current users of the __kprobes
annotation, because many of them misuse it. Most of the preparation,
registration, and optimization functions related to kprobes
are not involved in breakpoint or other exception
handling. This means that they never cause problems
such as infinite recursion if we put kprobes on them.
This also shrinks the blacklist a lot.

To make the blacklist easy to check, this series includes
a patch which provides a debugfs interface for the blacklist.
You can see which address regions/symbols are not allowed
to be probed via /sys/kernel/debug/kprobes/blacklist.

The blacklist now also supports modules. :)
kprobes users can define custom blacklisted functions which
will be called from kprobes handlers. The example code is also
updated, so you can see how it works.

This series also includes a change which prohibits probing
on addresses in .entry.text, because that code is used for
very low-level, sensitive interrupt/syscall entries. Probing
such code may cause unexpected results (actually most of
that area is already in the kprobe blacklist).
So I've decided to prohibit probing on all of it.

After applying this series, I got an empty .kprobes.text :)

$ grep kprobes_text System.map
81604980 T __kprobes_text_end
81604980 T __kprobes_text_start

Thank you,

Changes from previous version:
 - Replace __kprobes with NOKPROBE_SYMBOL() and remove
   unneeded __kprobes on the files compiled on x86.
 - Add blacklist on modules support.
 - Add debugfs interface for blacklist.
 - Fix indent of the NOKPROBE_SYMBOL() by using tabs.
 - Fix NOKPROBE_SYMBOL() for expanding nested macro.
 - Update Documentations/kprobes.txt about blacklist.
---

Masami Hiramatsu (22):
  kprobes: Prohibit probing on .entry.text code
  kprobes: Introduce NOKPROBE_SYMBOL() macro for blacklist
  kprobes: Show blacklist entries via debugfs
  kprobes: Support blacklist functions in module
  kprobes: Use NOKPROBE_SYMBOL() in sample modules
  kprobes/x86: Allow probe on some kprobe preparation functions
  kprobes/x86: Use NOKPROBE_SYMBOL instead of __kprobes
  kprobes: Allow probe on some kprobe functions
  kprobes: Use NOKPROBE_SYMBOL macro instead of __kprobes
  ftrace/kprobes: Allow probing on some preparation functions
  ftrace/kprobes: Use NOKPROBE_SYMBOL macro in ftrace
  x86/hw_breakpoint: Use NOKPROBE_SYMBOL macro in hw_breakpoint
  x86/trap: Use NOKPROBE_SYMBOL macro in trap.c
  x86/fault: Use NOKPROBE_SYMBOL macro in fault.c
  x86/alternative: Use NOKPROBE_SYMBOL macro in alternative.c
  x86/nmi: Use NOKPROBE_SYMBOL macro for nmi handlers
  x86/kvm: Use NOKPROBE_SYMBOL macro in kvm.c
  x86/dumpstack: Use NOKPROBE_SYMBOL macro in dumpstack.c
  [BUGFIX] kprobes/x86: Prohibit probing on debug_stack_*
  [BUGFIX] kprobes: Prohibit probing on func_ptr_is_kernel_text
  notifier: Use NOKPROBE_SYMBOL macro in notifier
  sched: Use NOKPROBE_SYMBOL macro in sched


 Documentation/kprobes.txt|   24 ++
 arch/x86/include/asm/traps.h |2 
 arch/x86/kernel/alternative.c|3 
 arch/x86/kernel/apic/hw_nmi.c|3 
 arch/x86/kernel/cpu/common.c |4 
 arch/x86/kernel/cpu/perf_event.c |3 
 arch/x86/kernel/cpu/perf_event_amd_ibs.c |3 
 arch/x86/kernel/dumpstack.c  |9 -
 arch/x86/kernel/entry_32.S   |   33 --
 arch/x86/kernel/entry_64.S   |   20 -
 arch/x86/kernel/hw_breakpoint.c  |6 
 arch/x86/kernel/kprobes/core.c   |   97 ---
 arch/x86/kernel/kprobes/ftrace.c |   17 +
 arch/x86/kernel/kprobes/opt.c|   32 +-
 arch/x86/kernel/kvm.c|4 
 arch/x86/kernel/nmi.c|   18 +
 arch/x86/kernel/paravirt.c   |4 
 arch/x86/kernel/traps.c  |   20 +
 arch/x86/mm/fault.c  |   28 +-
 include/asm-generic/vmlinux.lds.h|9 +
 include/linux/kprobes.h  |   20 +
 include/linux/module.h   |5 
 kernel/extable.c |2 
 kernel/kprobes.c |  415

Re: [PATCH 6/6] jfs: simplify lmLogSync() via list_last_entry_or_null()

2013-11-14 Thread Jeff Liu

Hi Dave and all,

Please ignore this patch from this series.

On 11/15 2013 12:35 PM, Jeff Liu wrote:
> From: Jie Liu 
> 
> Simplify the code in lmLogSync() via list_last_entry_or_null().
> 
> Signed-off-by: Jie Liu 
> ---
>  fs/jfs/jfs_logmgr.c | 10 +++---
>  1 file changed, 3 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
> index 7b565d0..2f90779 100644
> --- a/fs/jfs/jfs_logmgr.c
> +++ b/fs/jfs/jfs_logmgr.c
> @@ -963,13 +963,9 @@ static int lmLogSync(struct jfs_log * log, int hard_sync)
>  
>   if (log->sync == log->syncpt) {
>   LOGSYNC_LOCK(log, flags);
> - if (list_empty(>synclist))
> - log->sync = log->lsn;
> - else {
> - lp = list_entry(log->synclist.next,
> - struct logsyncblk, synclist);
> - log->sync = lp->lsn;
> - }
> + lp = list_last_entry_or_null(>synclist, struct logsyncblk,
> +  synclist);
Here it should be simplified via list_first_entry_or_null() instead; my
apologies for this mistake.
> + log->sync = lp ? lp->lsn : log->lsn;
>   LOGSYNC_UNLOCK(log, flags);
>  
>   }
> 

Thanks,
-Jeff
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Why have another variable deciding a tracepoint?

2013-11-14 Thread Steven Rostedt

I've been reviewing different users of tracepoints and I stumbled
across this:

drivers/gpu/host1x/cdma.c: host1x_cdma_push()

if (host1x_debug_trace_cmdbuf)
trace_host1x_cdma_push(dev_name(cdma_to_channel(cdma)->dev),
   op1, op2);

That host1x_debug_trace_cmdbuf is a variable that gets set by another
debugfs file "trace_cmdbuf" that is custom to this driver.

Why?

The tracepoint host1x_cdma_push is already controlled by either ftrace
or perf. If it gets enabled by perf or ftrace, it still won't be traced
unless we also enable this trace_cmdbuf. Is there some reason for this?
I can't figure it out from the change log: 6236451d83a720 ("gpu:
host1x: Add debug support").

As tracepoints use jump labels, there is no branch cost associated
with them. That is, they are either a direct jump, or a nop (in most
cases a nop). But here you added the overhead of a conditional branch
depending on this variable.

If this is truly needed, then use TRACE_EVENT_CONDITION() for that
tracepoint.


/me is baffled

-- Steve


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCHv5 05/20] cpufreq: cpufreq-cpu0: add dt node parsing for cooling device properties

2013-11-14 Thread viresh kumar

On Wednesday 13 November 2013 01:16 AM, Eduardo Valentin wrote:
> This patch changes the cpufreq-cpu0 driver to consider if
> a cpu needs cooling (with cpufreq). In case the cooling is needed,
> the cpu0 device tree node needs to be properly configured
> with cooling device properties.
> 
> In case these properties are present, the driver will
> load a cpufreq cooling device in the system. The cpufreq-cpu0
> driver is not interested in determining how the system should
> be using the cooling device. The driver is responsible
> only of loading the cooling device.
> 
> Describing how the cooling device will be used can be
> accomplished by setting up a thermal zone that references
> and is composed by the cpufreq cooling device.
> 
> Cc: "Rafael J. Wysocki" 
> Cc: Viresh Kumar 
> Cc: Grant Likely 
> Cc: Rob Herring 
> Cc: cpuf...@vger.kernel.org
> Cc: linux...@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> Cc: devicetree-disc...@lists.ozlabs.org
> Signed-off-by: Eduardo Valentin 
> ---
>  .../devicetree/bindings/cpufreq/cpufreq-cpu0.txt |  7 +++
>  drivers/cpufreq/Kconfig  |  2 +-
>  drivers/cpufreq/cpufreq-cpu0.c   | 16 
> 
>  3 files changed, 24 insertions(+), 1 deletion(-)

Acked-by: Viresh Kumar 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC PATCH] cpufreq: cpufreq-cpu0: do not allow transitions with regulators suspended

2013-11-14 Thread viresh kumar

On Friday 15 November 2013 03:30 AM, Rafael J. Wysocki wrote:
> I'm not going to apply anything like this.  If I have already, that's been a 
> mistake.
> 
> Do not mix assignments with logical operators in such outrageous ways, please.
> That's completely unreadable and confusing.

Okay... Will get it fixed for existing code as well..
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 6/6] jfs: simplify lmLogSync() via list_last_entry_or_null()

2013-11-14 Thread Jeff Liu

From: Jie Liu 

Simplify the code in lmLogSync() via list_last_entry_or_null().

Signed-off-by: Jie Liu 
---
 fs/jfs/jfs_logmgr.c | 10 +++---
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 7b565d0..2f90779 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -963,13 +963,9 @@ static int lmLogSync(struct jfs_log * log, int hard_sync)
 
if (log->sync == log->syncpt) {
LOGSYNC_LOCK(log, flags);
-   if (list_empty(&log->synclist))
-   log->sync = log->lsn;
-   else {
-   lp = list_entry(log->synclist.next,
-   struct logsyncblk, synclist);
-   log->sync = lp->lsn;
-   }
+   lp = list_last_entry_or_null(&log->synclist, struct logsyncblk,
+synclist);
+   log->sync = lp ? lp->lsn : log->lsn;
LOGSYNC_UNLOCK(log, flags);
 
}
-- 
1.8.3.2
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 5/6] jfs: simplify lmNextPage() via list_last_entry_or_null()

2013-11-14 Thread Jeff Liu

From: Jie Liu 

Simplify the code in lmNextPage via list_last_entry_or_null().

Signed-off-by: Jie Liu 
---
 fs/jfs/jfs_logmgr.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 360d27c..7b565d0 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -591,10 +591,7 @@ static int lmNextPage(struct jfs_log * log)
 *  write or queue the full page at the tail of write queue
 */
/* get the tail tblk on commit queue */
-   if (list_empty(&log->cqueue))
-   tblk = NULL;
-   else
-   tblk = list_entry(log->cqueue.prev, struct tblock, cqueue);
+   tblk = list_last_entry_or_null(&log->cqueue, struct tblock, cqueue);
 
/* every tblk who has COMMIT record on the current page,
 * and has not been committed, must be on commit queue
-- 
1.8.3.2
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 4/6] ubifs: simplify drop_last_node() via list_last_entry_or_null()

2013-11-14 Thread Jeff Liu

From: Jie Liu 

Simplify the code in drop_last_node() via list_last_entry_or_null().

Signed-off-by: Jie Liu 
---
 fs/ubifs/recovery.c | 8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 065096e..e46c394 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -603,12 +603,10 @@ static void drop_last_group(struct ubifs_scan_leb *sleb, 
int *offs)
  */
 static void drop_last_node(struct ubifs_scan_leb *sleb, int *offs)
 {
-   struct ubifs_scan_node *snod;
-
-   if (!list_empty(&sleb->nodes)) {
-   snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node,
- list);
+   struct ubifs_scan_node *snod = list_last_entry_or_null(&sleb->nodes,
+  struct ubifs_scan_node, list);
 
+   if (snod) {
dbg_rcvry("dropping last node at %d:%d",
  sleb->lnum, snod->offs);
*offs = snod->offs;
-- 
1.8.3.2
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 3/6] gfs2: simplify current_tail() via list_last_entry_or_null()

2013-11-14 Thread Jeff Liu

From: Jie Liu 

Simplify the code in current_tail() via list_last_entry_or_null().

Signed-off-by: Jie Liu 
---
 fs/gfs2/log.c | 10 +++---
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 610613f..555f767 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -441,13 +441,9 @@ static unsigned int current_tail(struct gfs2_sbd *sdp)
 
spin_lock(&sdp->sd_ail_lock);
 
-   if (list_empty(&sdp->sd_ail1_list)) {
-   tail = sdp->sd_log_head;
-   } else {
-   tr = list_entry(sdp->sd_ail1_list.prev, struct gfs2_trans,
-   tr_list);
-   tail = tr->tr_first;
-   }
+   tr = list_last_entry_or_null(&sdp->sd_ail1_list, struct gfs2_trans,
+tr_list);
+   tail = tr ? tr->tr_first : sdp->sd_log_head;
 
spin_unlock(&sdp->sd_ail_lock);
 
-- 
1.8.3.2
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 2/6] xfs: simplify xfs_ail_max() via list_last_entry_or_null()

2013-11-14 Thread Jeff Liu

From: Jie Liu 

Simplify xfs_ail_max() with list_last_entry_or_null(), and move it to
xfs_trans_priv.h as an inline function as now it is one line.

Signed-off-by: Jie Liu 
---
 fs/xfs/xfs_trans_ail.c  | 14 --
 fs/xfs/xfs_trans_priv.h | 12 
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index a728735..af605d0 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -62,20 +62,6 @@ xfs_ail_check(
 #endif /* DEBUG */
 
 /*
- * Return a pointer to the last item in the AIL.  If the AIL is empty, then
- * return NULL.
- */
-static xfs_log_item_t *
-xfs_ail_max(
-   struct xfs_ail  *ailp)
-{
-   if (list_empty(&ailp->xa_ail))
-   return NULL;
-
-   return list_entry(ailp->xa_ail.prev, xfs_log_item_t, li_ail);
-}
-
-/*
  * Return a pointer to the item which follows the given item in the AIL.  If
  * the given item is the last item in the list, then return NULL.
  */
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 12e86af..b1d93ae 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -97,6 +97,18 @@ xfs_ail_min(
li_ail);
 }
 
+/*
+ * Return a pointer to the last item in the AIL.  If the AIL is empty, then
+ * return NULL.
+ */
+static inline struct xfs_log_item *
+xfs_ail_max(
+   struct xfs_ail  *ailp)
+{
+   return list_last_entry_or_null(&ailp->xa_ail, struct xfs_log_item,
+  li_ail);
+}
+
 static inline void
 xfs_trans_ail_update(
struct xfs_ail  *ailp,
-- 
1.8.3.2
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 1/6] list: introduce list_last_entry_or_null()

2013-11-14 Thread Jeff Liu

From: Jie Liu 

Introduce a trivial helper list_last_entry_or_null() to fetch the
last entry from a list, return NULL if the list is empty.

Signed-off-by: Jie Liu 
---
 include/linux/list.h | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/include/linux/list.h b/include/linux/list.h
index ef95941..3337249 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -384,6 +384,17 @@ static inline void list_splice_tail_init(struct list_head 
*list,
(!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL)
 
 /**
+ * list_last_entry_or_null - get the last element from a list
+ * @ptr:   the list head to take the element from.
+ * @type:  the type of the struct this is embedded in.
+ * @member:the name of the list_struct within the struct.
+ *
+ * Note that if the list is empty, it returns NULL.
+ */
+#define list_last_entry_or_null(ptr, type, member) \
+   (!list_empty(ptr) ? list_last_entry(ptr, type, member) : NULL)
+
+/**
  * list_next_entry - get the next element in list
  * @pos:   the type * to cursor
  * @member:the name of the list_struct within the struct.
-- 
1.8.3.2
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 0/6] list: introduce list_last_entry_or_null()

2013-11-14 Thread Jeff Liu

Hi Folks,

This patch series introduces a new list helper that retrieves the
last entry of a list, or returns NULL if the list is empty.
It is inspired by Jiri Pirko's list_first_entry_or_null().

With this trivial helper, we can slightly simplify
the code logic of xfs_ail_max() and make the code look a bit more
consistent with xfs_ail_min(), which has already been simplified
via list_first_entry_or_null().

Moreover, based on a simple grep against the fs and drivers trees,
I think there are some other places that could make use of it for a similar
purpose, so I have applied this helper to some other file systems as well.

Any comments are welcome!

Thanks,
-Jeff
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v2] dynamic_debug: add wildcard support to filter files/functions/modules

2013-11-14 Thread Changbin Du

2013/11/7 Joe Perches :
> On Thu, 2013-11-07 at 11:11 +0800, Changbin Du wrote:
>> 2013/11/1 Joe Perches :
>> match_ functions in lib/parser.c just do simple match, they
>> doesn't support wildcards.
>> So it's not useful for us.
>
> It's not meant to be useful so much as be a possible
> generic location for your match_regex function.
>
I misunderstood you. This is an appropriate file to place the matching function.
I have moved it to this file.

I checked the regex_match_foo functions in trace_events_filter.c. From my
reading, they are a little tedious and only support the '*' character. They are
also tied to the tracing framework, so I gave up on reusing them.

I will send a new version of the patch set to you later.

Thanks!
Du, Changbin
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] btrfs: Use trace condition for get_extent tracepoint

2013-11-14 Thread Steven Rostedt

Doing an if statement to test some condition to know if we should
trigger a tracepoint is pointless when tracing is disabled. This just
adds overhead and wastes a branch prediction. This is why the
TRACE_EVENT_CONDITION() was created. It places the check inside the jump
label so that the branch does not happen unless tracing is enabled.

That is, instead of doing:

if (em)
trace_btrfs_get_extent(root, em);

Which is basically this:

if (em)
if (static_key(trace_btrfs_get_extent)) {


Using a TRACE_EVENT_CONDITION() we can just do:

trace_btrfs_get_extent(root, em);

And the condition trace event will do:

if (static_key(trace_btrfs_get_extent)) {
if (em) {
...

The static key is a non conditional jump (or nop) that is faster than
having to check if em is NULL or not.

Signed-off-by: Steven Rostedt 

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 51e3afa..a9ad918 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6173,8 +6173,7 @@ insert:
write_unlock(&em_tree->lock);
 out:
 
-   if (em)
-   trace_btrfs_get_extent(root, em);
+   trace_btrfs_get_extent(root, em);
 
if (path)
btrfs_free_path(path);
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index f18b3b7..4832d75 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -162,12 +162,14 @@ DEFINE_EVENT(btrfs__inode, btrfs_inode_evict,
{ EXTENT_FLAG_LOGGING,  "LOGGING"   },  \
{ EXTENT_FLAG_FILLING,  "FILLING"   })
 
-TRACE_EVENT(btrfs_get_extent,
+TRACE_EVENT_CONDITION(btrfs_get_extent,
 
TP_PROTO(struct btrfs_root *root, struct extent_map *map),
 
TP_ARGS(root, map),
 
+   TP_CONDITION(map),
+
TP_STRUCT__entry(
__field(u64,  root_objectid )
__field(u64,  start )
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

1 2 3 4 5 6 7 8 9 10 >

1 - 100 of 1412 matches

Mail list logo