[RFC 15/20] net: dsa: add tree-wide FDB ops

2016-04-27 Thread Vivien Didelot
In order to support cross-chip operations, we need to inform each switch
driver when a port operation occurs in a DSA tree.

Implement tree-wide FDB operations.

Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/bcm_sf2.c   | 12 
 drivers/net/dsa/mv88e6xxx.c | 12 
 net/dsa/dsa_priv.h  |  9 ++
 net/dsa/slave.c | 68 ++---
 net/dsa/tree.c  | 61 
 5 files changed, 109 insertions(+), 53 deletions(-)

diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 0a91ea9..6e634e5 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -733,6 +733,9 @@ static int bcm_sf2_sw_fdb_prepare(struct dsa_switch *ds, 
struct dsa_port *dp,
  const struct switchdev_obj_port_fdb *fdb,
  struct switchdev_trans *trans)
 {
+   if (dsa_port_is_external(dp, ds))
+   return -EOPNOTSUPP;
+
/* We do not need to do anything specific here yet */
return 0;
 }
@@ -743,6 +746,9 @@ static void bcm_sf2_sw_fdb_add(struct dsa_switch *ds, 
struct dsa_port *dp,
 {
struct bcm_sf2_priv *priv = ds_to_priv(ds);
 
+   if (dsa_port_is_external(dp, ds))
+   return;
+
if (bcm_sf2_arl_op(priv, 0, dp->port, fdb->addr, fdb->vid, true))
pr_err("%s: failed to add MAC address\n", __func__);
 }
@@ -752,6 +758,9 @@ static int bcm_sf2_sw_fdb_del(struct dsa_switch *ds, struct 
dsa_port *dp,
 {
struct bcm_sf2_priv *priv = ds_to_priv(ds);
 
+   if (dsa_port_is_external(dp, ds))
+   return -EOPNOTSUPP;
+
return bcm_sf2_arl_op(priv, 0, dp->port, fdb->addr, fdb->vid, false);
 }
 
@@ -813,6 +822,9 @@ static int bcm_sf2_sw_fdb_dump(struct dsa_switch *ds, 
struct dsa_port *dp,
unsigned int count = 0;
int ret;
 
+   if (dsa_port_is_external(dp, ds))
+   return -EOPNOTSUPP;
+
dev = ds->ports[dp->port];
 
/* Start search operation */
diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index 6fef29b..7d29de3 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -2037,6 +2037,9 @@ int mv88e6xxx_port_fdb_prepare(struct dsa_switch *ds, 
struct dsa_port *dp,
   const struct switchdev_obj_port_fdb *fdb,
   struct switchdev_trans *trans)
 {
+   if (dsa_port_is_external(dp, ds))
+   return -EOPNOTSUPP;
+
/* We don't need any dynamic resource from the kernel (yet),
 * so skip the prepare phase.
 */
@@ -2052,6 +2055,9 @@ void mv88e6xxx_port_fdb_add(struct dsa_switch *ds, struct 
dsa_port *dp,
GLOBAL_ATU_DATA_STATE_UC_STATIC;
struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
 
+   if (dsa_port_is_external(dp, ds))
+   return;
+
mutex_lock(&ps->smi_mutex);
if (_mv88e6xxx_port_fdb_load(ds, dp->port, fdb->addr, fdb->vid, state))
netdev_err(ds->ports[dp->port], "failed to load MAC address\n");
@@ -2064,6 +2070,9 @@ int mv88e6xxx_port_fdb_del(struct dsa_switch *ds, struct 
dsa_port *dp,
struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
int ret;
 
+   if (dsa_port_is_external(dp, ds))
+   return -EOPNOTSUPP;
+
mutex_lock(&ps->smi_mutex);
ret = _mv88e6xxx_port_fdb_load(ds, dp->port, fdb->addr, fdb->vid,
   GLOBAL_ATU_DATA_STATE_UNUSED);
@@ -2169,6 +2178,9 @@ int mv88e6xxx_port_fdb_dump(struct dsa_switch *ds, struct 
dsa_port *dp,
u16 fid;
int err;
 
+   if (dsa_port_is_external(dp, ds))
+   return -EOPNOTSUPP;
+
mutex_lock(&ps->smi_mutex);
 
/* Dump port's default Filtering Information Database (VLAN ID 0) */
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 6e08b3d..e8765c3 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct dsa_device_ops {
struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev);
@@ -51,6 +52,14 @@ int dsa_tree_bridge_port_join(struct dsa_switch_tree *dst, 
struct dsa_port *dp,
  struct net_device *br);
 void dsa_tree_bridge_port_leave(struct dsa_switch_tree *dst,
struct dsa_port *dp, struct net_device *br);
+int dsa_tree_port_fdb_add(struct dsa_switch_tree *dst, struct dsa_port *dp,
+ const struct switchdev_obj_port_fdb *fdb,
+ struct switchdev_trans *trans);
+int dsa_tree_port_fdb_del(struct dsa_switch_tree *dst, struct dsa_port *dp,
+ const struct switchdev_obj_port_fdb *fdb);
+int dsa_tree_port_fdb_dump(struct dsa_switch_tree *dst, struct dsa_port *dp,
+  struct 

[RFC 04/20] net: dsa: pass dsa_port down to drivers FDB ops

2016-04-27 Thread Vivien Didelot
Now that DSA has a proper structure for DSA ports, pass it down to the
port_fdb_{prepare,add,del,dump} driver functions.

Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/bcm_sf2.c   | 20 +++-
 drivers/net/dsa/mv88e6xxx.c | 22 +++---
 drivers/net/dsa/mv88e6xxx.h |  8 
 include/net/dsa.h   |  8 
 net/dsa/slave.c |  8 
 5 files changed, 34 insertions(+), 32 deletions(-)

diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 2d7b297..f7b53fa 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -725,7 +725,7 @@ static int bcm_sf2_arl_op(struct bcm_sf2_priv *priv, int 
op, int port,
return bcm_sf2_arl_read(priv, mac, vid, , , is_valid);
 }
 
-static int bcm_sf2_sw_fdb_prepare(struct dsa_switch *ds, int port,
+static int bcm_sf2_sw_fdb_prepare(struct dsa_switch *ds, struct dsa_port *dp,
  const struct switchdev_obj_port_fdb *fdb,
  struct switchdev_trans *trans)
 {
@@ -733,22 +733,22 @@ static int bcm_sf2_sw_fdb_prepare(struct dsa_switch *ds, 
int port,
return 0;
 }
 
-static void bcm_sf2_sw_fdb_add(struct dsa_switch *ds, int port,
+static void bcm_sf2_sw_fdb_add(struct dsa_switch *ds, struct dsa_port *dp,
   const struct switchdev_obj_port_fdb *fdb,
   struct switchdev_trans *trans)
 {
struct bcm_sf2_priv *priv = ds_to_priv(ds);
 
-   if (bcm_sf2_arl_op(priv, 0, port, fdb->addr, fdb->vid, true))
+   if (bcm_sf2_arl_op(priv, 0, dp->port, fdb->addr, fdb->vid, true))
pr_err("%s: failed to add MAC address\n", __func__);
 }
 
-static int bcm_sf2_sw_fdb_del(struct dsa_switch *ds, int port,
+static int bcm_sf2_sw_fdb_del(struct dsa_switch *ds, struct dsa_port *dp,
  const struct switchdev_obj_port_fdb *fdb)
 {
struct bcm_sf2_priv *priv = ds_to_priv(ds);
 
-   return bcm_sf2_arl_op(priv, 0, port, fdb->addr, fdb->vid, false);
+   return bcm_sf2_arl_op(priv, 0, dp->port, fdb->addr, fdb->vid, false);
 }
 
 static int bcm_sf2_arl_search_wait(struct bcm_sf2_priv *priv)
@@ -799,16 +799,18 @@ static int bcm_sf2_sw_fdb_copy(struct net_device *dev, 
int port,
return cb(&fdb->obj);
 }
 
-static int bcm_sf2_sw_fdb_dump(struct dsa_switch *ds, int port,
+static int bcm_sf2_sw_fdb_dump(struct dsa_switch *ds, struct dsa_port *dp,
   struct switchdev_obj_port_fdb *fdb,
   int (*cb)(struct switchdev_obj *obj))
 {
struct bcm_sf2_priv *priv = ds_to_priv(ds);
-   struct net_device *dev = ds->ports[port];
+   struct net_device *dev;
struct bcm_sf2_arl_entry results[2];
unsigned int count = 0;
int ret;
 
+   dev = ds->ports[dp->port];
+
/* Start search operation */
core_writel(priv, ARLA_SRCH_STDN, CORE_ARLA_SRCH_CTL);
 
@@ -819,12 +821,12 @@ static int bcm_sf2_sw_fdb_dump(struct dsa_switch *ds, int 
port,
 
/* Read both entries, then return their values back */
bcm_sf2_arl_search_rd(priv, 0, &results[0]);
-   ret = bcm_sf2_sw_fdb_copy(dev, port, &results[0], fdb, cb);
+   ret = bcm_sf2_sw_fdb_copy(dev, dp->port, &results[0], fdb, cb);
if (ret)
return ret;
 
bcm_sf2_arl_search_rd(priv, 1, &results[1]);
-   ret = bcm_sf2_sw_fdb_copy(dev, port, &results[1], fdb, cb);
+   ret = bcm_sf2_sw_fdb_copy(dev, dp->port, &results[1], fdb, cb);
if (ret)
return ret;
 
diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index 3f78c73..c1ff763 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -2031,7 +2031,7 @@ static int _mv88e6xxx_port_fdb_load(struct dsa_switch 
*ds, int port,
return _mv88e6xxx_atu_load(ds, );
 }
 
-int mv88e6xxx_port_fdb_prepare(struct dsa_switch *ds, int port,
+int mv88e6xxx_port_fdb_prepare(struct dsa_switch *ds, struct dsa_port *dp,
   const struct switchdev_obj_port_fdb *fdb,
   struct switchdev_trans *trans)
 {
@@ -2041,7 +2041,7 @@ int mv88e6xxx_port_fdb_prepare(struct dsa_switch *ds, int 
port,
return 0;
 }
 
-void mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int port,
+void mv88e6xxx_port_fdb_add(struct dsa_switch *ds, struct dsa_port *dp,
const struct switchdev_obj_port_fdb *fdb,
struct switchdev_trans *trans)
 {
@@ -2051,19 +2051,19 @@ void mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int 
port,
struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
 
mutex_lock(&ps->smi_mutex);
-   if (_mv88e6xxx_port_fdb_load(ds, port, fdb->addr, fdb->vid, state))
-   netdev_err(ds->ports[port], "failed to load MAC address\n");
+   if 

[RFC 02/20] net: dsa: be consistent with NETDEV_CHANGEUPPER

2016-04-27 Thread Vivien Didelot
Once NETDEV_CHANGEUPPER is emitted, the device is already (un)bridged.

If an error is returned on port_bridge_join, the bridge layer will
rollback the operation and unbridge the port.

Respect this by setting bridge_dev to NULL on error.

Also the DSA layer shouldn't assume that the drivers know about the
bridge device a port was previously bridged to. So pass the bridge
device to port_bridge_leave.

Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/bcm_sf2.c   |  4 ++--
 drivers/net/dsa/mv88e6xxx.c |  4 ++--
 drivers/net/dsa/mv88e6xxx.h |  3 ++-
 include/net/dsa.h   |  3 ++-
 net/dsa/slave.c | 13 +
 5 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 448deb5..f394ea9 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -525,10 +525,10 @@ static int bcm_sf2_sw_br_join(struct dsa_switch *ds, int 
port,
return 0;
 }
 
-static void bcm_sf2_sw_br_leave(struct dsa_switch *ds, int port)
+static void bcm_sf2_sw_br_leave(struct dsa_switch *ds, int port,
+   struct net_device *bridge)
 {
struct bcm_sf2_priv *priv = ds_to_priv(ds);
-   struct net_device *bridge = priv->port_sts[port].bridge_dev;
unsigned int i;
u32 reg, p_ctl;
 
diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index 028f92f..86f8f2f 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -2227,10 +2227,10 @@ int mv88e6xxx_port_bridge_join(struct dsa_switch *ds, 
int port,
return err;
 }
 
-void mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port)
+void mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port,
+struct net_device *bridge)
 {
struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
-   struct net_device *bridge = ps->ports[port].bridge_dev;
int i;
 
mutex_lock(&ps->smi_mutex);
diff --git a/drivers/net/dsa/mv88e6xxx.h b/drivers/net/dsa/mv88e6xxx.h
index 0dbe2d1..2eb9a82 100644
--- a/drivers/net/dsa/mv88e6xxx.h
+++ b/drivers/net/dsa/mv88e6xxx.h
@@ -492,7 +492,8 @@ int mv88e6xxx_set_eee(struct dsa_switch *ds, int port,
  struct phy_device *phydev, struct ethtool_eee *e);
 int mv88e6xxx_port_bridge_join(struct dsa_switch *ds, int port,
   struct net_device *bridge);
-void mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port);
+void mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port,
+struct net_device *bridge);
 void mv88e6xxx_port_stp_state_set(struct dsa_switch *ds, int port, u8 state);
 int mv88e6xxx_port_vlan_filtering(struct dsa_switch *ds, int port,
  bool vlan_filtering);
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 255c108..ed33500 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -305,7 +305,8 @@ struct dsa_switch_driver {
 */
int (*port_bridge_join)(struct dsa_switch *ds, int port,
struct net_device *bridge);
-   void(*port_bridge_leave)(struct dsa_switch *ds, int port);
+   void(*port_bridge_leave)(struct dsa_switch *ds, int port,
+struct net_device *bridge);
void(*port_stp_state_set)(struct dsa_switch *ds, int port,
  u8 state);
 
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 6115444..f2ec13d 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -443,19 +443,24 @@ static int dsa_slave_bridge_port_join(struct net_device 
*dev,
if (ds->drv->port_bridge_join)
ret = ds->drv->port_bridge_join(ds, p->dp->port, br);
 
-   return ret == -EOPNOTSUPP ? 0 : ret;
+   if (ret && ret != -EOPNOTSUPP) {
+   p->bridge_dev = NULL;
+   return ret;
+   }
+
+   return 0;
 }
 
 static void dsa_slave_bridge_port_leave(struct net_device *dev)
 {
struct dsa_slave_priv *p = netdev_priv(dev);
struct dsa_switch *ds = p->dp->ds;
+   struct net_device *br = p->bridge_dev;
 
+   p->bridge_dev = NULL;
 
if (ds->drv->port_bridge_leave)
-   ds->drv->port_bridge_leave(ds, p->dp->port);
-
-   p->bridge_dev = NULL;
+   ds->drv->port_bridge_leave(ds, p->dp->port, br);
 
/* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer,
 * so allow it to be in BR_STATE_FORWARDING to be kept functional
-- 
2.8.0



[RFC 13/20] net: dsa: list switches in tree

2016-04-27 Thread Vivien Didelot
List the registered dsa_switch structures in a "ds" member of the
dsa_switch_tree structure. This allows the drivers to easily iterate on
the DSA switch structures of their related DSA tree.

Signed-off-by: Vivien Didelot 
---
 include/net/dsa.h | 9 +
 net/dsa/dsa.c | 3 +++
 2 files changed, 12 insertions(+)

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 389227d..85fac8a 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -32,11 +32,16 @@ enum dsa_tag_protocol {
 #define DSA_MAX_SWITCHES   4
 #define DSA_MAX_PORTS  12
 
+
+#define dsa_tree_for_each_switch(_dst, _ds)\
+   list_for_each_entry(_ds, &_dst->ds, list)
+
 #define dsa_switch_for_each_port(_ds, _dp, _num_ports) \
for (_dp = list_first_entry(&_ds->dp, typeof(*_dp), list);  \
 &_dp->list != (&_ds->dp) && _dp->port < _num_ports;\
 _dp = list_next_entry(_dp, list))
 
+
 struct dsa_chip_data {
/*
 * How to access the switch configuration registers.
@@ -125,6 +130,8 @@ struct dsa_switch_tree {
 * Data for the individual switch chips.
 */
struct dsa_switch   *switches[DSA_MAX_SWITCHES];
+
+   struct list_headds;
 };
 
 struct dsa_port {
@@ -137,6 +144,8 @@ struct dsa_port {
 };
 
 struct dsa_switch {
+   struct list_headlist;
+
/*
 * Parent switch tree, and switch index.
 */
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index aa4a61a..b0055c7 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -842,6 +842,8 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, 
struct net_device *dev,
int i;
unsigned configured = 0;
 
+   INIT_LIST_HEAD(&dst->ds);
+
dst->pd = pd;
dst->master_netdev = dev;
dst->cpu_switch = -1;
@@ -858,6 +860,7 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, 
struct net_device *dev,
}
 
dst->switches[i] = ds;
+   list_add_tail(&ds->list, &dst->ds);
 
++configured;
}
-- 
2.8.0



[RFC 08/20] net: dsa: bcm_sf2: use bridge device from dsa_port

2016-04-27 Thread Vivien Didelot
Now that the DSA layer exposes the DSA port structures to drivers, use
that to retrieve the port bridge membership and thus get rid of the
private bridge_dev pointer.

Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/bcm_sf2.c | 30 ++
 drivers/net/dsa/bcm_sf2.h |  2 --
 2 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index f7b53fa..6e3b844 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -495,25 +495,24 @@ static int bcm_sf2_sw_br_join(struct dsa_switch *ds, 
struct dsa_port *dp,
  struct net_device *bridge)
 {
struct bcm_sf2_priv *priv = ds_to_priv(ds);
-   unsigned int i;
+   struct dsa_port *intp;
u32 reg, p_ctl;
 
-   priv->port_sts[dp->port].bridge_dev = bridge;
p_ctl = core_readl(priv, CORE_PORT_VLAN_CTL_PORT(dp->port));
 
-   for (i = 0; i < priv->hw_params.num_ports; i++) {
-   if (priv->port_sts[i].bridge_dev != bridge)
+   dsa_switch_for_each_port(ds, intp, priv->hw_params.num_ports) {
+   if (intp->br != bridge)
continue;
 
/* Add this local port to the remote port VLAN control
 * membership and update the remote port bitmask
 */
-   reg = core_readl(priv, CORE_PORT_VLAN_CTL_PORT(i));
+   reg = core_readl(priv, CORE_PORT_VLAN_CTL_PORT(intp->port));
reg |= 1 << dp->port;
-   core_writel(priv, reg, CORE_PORT_VLAN_CTL_PORT(i));
-   priv->port_sts[i].vlan_ctl_mask = reg;
+   core_writel(priv, reg, CORE_PORT_VLAN_CTL_PORT(intp->port));
+   priv->port_sts[intp->port].vlan_ctl_mask = reg;
 
-   p_ctl |= 1 << i;
+   p_ctl |= 1 << intp->port;
}
 
/* Configure the local port VLAN control membership to include
@@ -529,29 +528,28 @@ static void bcm_sf2_sw_br_leave(struct dsa_switch *ds, 
struct dsa_port *dp,
struct net_device *bridge)
 {
struct bcm_sf2_priv *priv = ds_to_priv(ds);
-   unsigned int i;
+   struct dsa_port *intp;
u32 reg, p_ctl;
 
p_ctl = core_readl(priv, CORE_PORT_VLAN_CTL_PORT(dp->port));
 
-   for (i = 0; i < priv->hw_params.num_ports; i++) {
+   dsa_switch_for_each_port(ds, intp, priv->hw_params.num_ports) {
/* Don't touch the remaining ports */
-   if (priv->port_sts[i].bridge_dev != bridge)
+   if (intp->br != bridge)
continue;
 
-   reg = core_readl(priv, CORE_PORT_VLAN_CTL_PORT(i));
+   reg = core_readl(priv, CORE_PORT_VLAN_CTL_PORT(intp->port));
reg &= ~(1 << dp->port);
-   core_writel(priv, reg, CORE_PORT_VLAN_CTL_PORT(i));
+   core_writel(priv, reg, CORE_PORT_VLAN_CTL_PORT(intp->port));
priv->port_sts[dp->port].vlan_ctl_mask = reg;
 
/* Prevent self removal to preserve isolation */
-   if (dp->port != i)
-   p_ctl &= ~(1 << i);
+   if (dp != intp)
+   p_ctl &= ~(1 << intp->port);
}
 
core_writel(priv, p_ctl, CORE_PORT_VLAN_CTL_PORT(dp->port));
priv->port_sts[dp->port].vlan_ctl_mask = p_ctl;
-   priv->port_sts[dp->port].bridge_dev = NULL;
 }
 
 static void bcm_sf2_sw_br_set_stp_state(struct dsa_switch *ds, int port,
diff --git a/drivers/net/dsa/bcm_sf2.h b/drivers/net/dsa/bcm_sf2.h
index 200b1f5..6bba1c9 100644
--- a/drivers/net/dsa/bcm_sf2.h
+++ b/drivers/net/dsa/bcm_sf2.h
@@ -50,8 +50,6 @@ struct bcm_sf2_port_status {
struct ethtool_eee eee;
 
u32 vlan_ctl_mask;
-
-   struct net_device *bridge_dev;
 };
 
 struct bcm_sf2_arl_entry {
-- 
2.8.0



[RFC 15/20] net: dsa: add tree-wide FDB ops

2016-04-27 Thread Vivien Didelot
In order to support cross-chip operations, we need to inform each switch
driver when a port operation occurs in a DSA tree.

Implement tree-wide FDB operations.

Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/bcm_sf2.c   | 12 
 drivers/net/dsa/mv88e6xxx.c | 12 
 net/dsa/dsa_priv.h  |  9 ++
 net/dsa/slave.c | 68 ++---
 net/dsa/tree.c  | 61 
 5 files changed, 109 insertions(+), 53 deletions(-)

diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 0a91ea9..6e634e5 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -733,6 +733,9 @@ static int bcm_sf2_sw_fdb_prepare(struct dsa_switch *ds, 
struct dsa_port *dp,
  const struct switchdev_obj_port_fdb *fdb,
  struct switchdev_trans *trans)
 {
+   if (dsa_port_is_external(dp, ds))
+   return -EOPNOTSUPP;
+
/* We do not need to do anything specific here yet */
return 0;
 }
@@ -743,6 +746,9 @@ static void bcm_sf2_sw_fdb_add(struct dsa_switch *ds, 
struct dsa_port *dp,
 {
struct bcm_sf2_priv *priv = ds_to_priv(ds);
 
+   if (dsa_port_is_external(dp, ds))
+   return;
+
if (bcm_sf2_arl_op(priv, 0, dp->port, fdb->addr, fdb->vid, true))
pr_err("%s: failed to add MAC address\n", __func__);
 }
@@ -752,6 +758,9 @@ static int bcm_sf2_sw_fdb_del(struct dsa_switch *ds, struct 
dsa_port *dp,
 {
struct bcm_sf2_priv *priv = ds_to_priv(ds);
 
+   if (dsa_port_is_external(dp, ds))
+   return -EOPNOTSUPP;
+
return bcm_sf2_arl_op(priv, 0, dp->port, fdb->addr, fdb->vid, false);
 }
 
@@ -813,6 +822,9 @@ static int bcm_sf2_sw_fdb_dump(struct dsa_switch *ds, 
struct dsa_port *dp,
unsigned int count = 0;
int ret;
 
+   if (dsa_port_is_external(dp, ds))
+   return -EOPNOTSUPP;
+
dev = ds->ports[dp->port];
 
/* Start search operation */
diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index 6fef29b..7d29de3 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -2037,6 +2037,9 @@ int mv88e6xxx_port_fdb_prepare(struct dsa_switch *ds, 
struct dsa_port *dp,
   const struct switchdev_obj_port_fdb *fdb,
   struct switchdev_trans *trans)
 {
+   if (dsa_port_is_external(dp, ds))
+   return -EOPNOTSUPP;
+
/* We don't need any dynamic resource from the kernel (yet),
 * so skip the prepare phase.
 */
@@ -2052,6 +2055,9 @@ void mv88e6xxx_port_fdb_add(struct dsa_switch *ds, struct 
dsa_port *dp,
GLOBAL_ATU_DATA_STATE_UC_STATIC;
struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
 
+   if (dsa_port_is_external(dp, ds))
+   return;
+
mutex_lock(&ps->smi_mutex);
if (_mv88e6xxx_port_fdb_load(ds, dp->port, fdb->addr, fdb->vid, state))
netdev_err(ds->ports[dp->port], "failed to load MAC address\n");
@@ -2064,6 +2070,9 @@ int mv88e6xxx_port_fdb_del(struct dsa_switch *ds, struct 
dsa_port *dp,
struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
int ret;
 
+   if (dsa_port_is_external(dp, ds))
+   return -EOPNOTSUPP;
+
mutex_lock(&ps->smi_mutex);
ret = _mv88e6xxx_port_fdb_load(ds, dp->port, fdb->addr, fdb->vid,
   GLOBAL_ATU_DATA_STATE_UNUSED);
@@ -2169,6 +2178,9 @@ int mv88e6xxx_port_fdb_dump(struct dsa_switch *ds, struct 
dsa_port *dp,
u16 fid;
int err;
 
+   if (dsa_port_is_external(dp, ds))
+   return -EOPNOTSUPP;
+
mutex_lock(&ps->smi_mutex);
 
/* Dump port's default Filtering Information Database (VLAN ID 0) */
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 6e08b3d..e8765c3 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct dsa_device_ops {
struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev);
@@ -51,6 +52,14 @@ int dsa_tree_bridge_port_join(struct dsa_switch_tree *dst, 
struct dsa_port *dp,
  struct net_device *br);
 void dsa_tree_bridge_port_leave(struct dsa_switch_tree *dst,
struct dsa_port *dp, struct net_device *br);
+int dsa_tree_port_fdb_add(struct dsa_switch_tree *dst, struct dsa_port *dp,
+ const struct switchdev_obj_port_fdb *fdb,
+ struct switchdev_trans *trans);
+int dsa_tree_port_fdb_del(struct dsa_switch_tree *dst, struct dsa_port *dp,
+ const struct switchdev_obj_port_fdb *fdb);
+int dsa_tree_port_fdb_dump(struct dsa_switch_tree *dst, struct dsa_port *dp,
+  struct switchdev_obj_port_fdb *fdb,
+  

[RFC 04/20] net: dsa: pass dsa_port down to drivers FDB ops

2016-04-27 Thread Vivien Didelot
Now that DSA has a proper structure for DSA ports, pass it down to the
port_fdb_{prepare,add,del,dump} driver functions.

Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/bcm_sf2.c   | 20 +++-
 drivers/net/dsa/mv88e6xxx.c | 22 +++---
 drivers/net/dsa/mv88e6xxx.h |  8 
 include/net/dsa.h   |  8 
 net/dsa/slave.c |  8 
 5 files changed, 34 insertions(+), 32 deletions(-)

diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 2d7b297..f7b53fa 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -725,7 +725,7 @@ static int bcm_sf2_arl_op(struct bcm_sf2_priv *priv, int 
op, int port,
return bcm_sf2_arl_read(priv, mac, vid, , , is_valid);
 }
 
-static int bcm_sf2_sw_fdb_prepare(struct dsa_switch *ds, int port,
+static int bcm_sf2_sw_fdb_prepare(struct dsa_switch *ds, struct dsa_port *dp,
  const struct switchdev_obj_port_fdb *fdb,
  struct switchdev_trans *trans)
 {
@@ -733,22 +733,22 @@ static int bcm_sf2_sw_fdb_prepare(struct dsa_switch *ds, 
int port,
return 0;
 }
 
-static void bcm_sf2_sw_fdb_add(struct dsa_switch *ds, int port,
+static void bcm_sf2_sw_fdb_add(struct dsa_switch *ds, struct dsa_port *dp,
   const struct switchdev_obj_port_fdb *fdb,
   struct switchdev_trans *trans)
 {
struct bcm_sf2_priv *priv = ds_to_priv(ds);
 
-   if (bcm_sf2_arl_op(priv, 0, port, fdb->addr, fdb->vid, true))
+   if (bcm_sf2_arl_op(priv, 0, dp->port, fdb->addr, fdb->vid, true))
pr_err("%s: failed to add MAC address\n", __func__);
 }
 
-static int bcm_sf2_sw_fdb_del(struct dsa_switch *ds, int port,
+static int bcm_sf2_sw_fdb_del(struct dsa_switch *ds, struct dsa_port *dp,
  const struct switchdev_obj_port_fdb *fdb)
 {
struct bcm_sf2_priv *priv = ds_to_priv(ds);
 
-   return bcm_sf2_arl_op(priv, 0, port, fdb->addr, fdb->vid, false);
+   return bcm_sf2_arl_op(priv, 0, dp->port, fdb->addr, fdb->vid, false);
 }
 
 static int bcm_sf2_arl_search_wait(struct bcm_sf2_priv *priv)
@@ -799,16 +799,18 @@ static int bcm_sf2_sw_fdb_copy(struct net_device *dev, 
int port,
return cb(&fdb->obj);
 }
 
-static int bcm_sf2_sw_fdb_dump(struct dsa_switch *ds, int port,
+static int bcm_sf2_sw_fdb_dump(struct dsa_switch *ds, struct dsa_port *dp,
   struct switchdev_obj_port_fdb *fdb,
   int (*cb)(struct switchdev_obj *obj))
 {
struct bcm_sf2_priv *priv = ds_to_priv(ds);
-   struct net_device *dev = ds->ports[port];
+   struct net_device *dev;
struct bcm_sf2_arl_entry results[2];
unsigned int count = 0;
int ret;
 
+   dev = ds->ports[dp->port];
+
/* Start search operation */
core_writel(priv, ARLA_SRCH_STDN, CORE_ARLA_SRCH_CTL);
 
@@ -819,12 +821,12 @@ static int bcm_sf2_sw_fdb_dump(struct dsa_switch *ds, int 
port,
 
/* Read both entries, then return their values back */
bcm_sf2_arl_search_rd(priv, 0, &results[0]);
-   ret = bcm_sf2_sw_fdb_copy(dev, port, &results[0], fdb, cb);
+   ret = bcm_sf2_sw_fdb_copy(dev, dp->port, &results[0], fdb, cb);
if (ret)
return ret;
 
bcm_sf2_arl_search_rd(priv, 1, &results[1]);
-   ret = bcm_sf2_sw_fdb_copy(dev, port, &results[1], fdb, cb);
+   ret = bcm_sf2_sw_fdb_copy(dev, dp->port, &results[1], fdb, cb);
if (ret)
return ret;
 
diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index 3f78c73..c1ff763 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -2031,7 +2031,7 @@ static int _mv88e6xxx_port_fdb_load(struct dsa_switch 
*ds, int port,
return _mv88e6xxx_atu_load(ds, );
 }
 
-int mv88e6xxx_port_fdb_prepare(struct dsa_switch *ds, int port,
+int mv88e6xxx_port_fdb_prepare(struct dsa_switch *ds, struct dsa_port *dp,
   const struct switchdev_obj_port_fdb *fdb,
   struct switchdev_trans *trans)
 {
@@ -2041,7 +2041,7 @@ int mv88e6xxx_port_fdb_prepare(struct dsa_switch *ds, int 
port,
return 0;
 }
 
-void mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int port,
+void mv88e6xxx_port_fdb_add(struct dsa_switch *ds, struct dsa_port *dp,
const struct switchdev_obj_port_fdb *fdb,
struct switchdev_trans *trans)
 {
@@ -2051,19 +2051,19 @@ void mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int 
port,
struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
 
mutex_lock(&ps->smi_mutex);
-   if (_mv88e6xxx_port_fdb_load(ds, port, fdb->addr, fdb->vid, state))
-   netdev_err(ds->ports[port], "failed to load MAC address\n");
+   if (_mv88e6xxx_port_fdb_load(ds, dp->port, 

[RFC 02/20] net: dsa: be consistent with NETDEV_CHANGEUPPER

2016-04-27 Thread Vivien Didelot
Once NETDEV_CHANGEUPPER is emitted, the device is already (un)bridged.

If an error is returned on port_bridge_join, the bridge layer will
rollback the operation and unbridge the port.

Respect this by setting bridge_dev to NULL on error.

Also the DSA layer shouldn't assume that the drivers know about the
bridge device a port was previously bridged to. So pass the bridge
device to port_bridge_leave.

Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/bcm_sf2.c   |  4 ++--
 drivers/net/dsa/mv88e6xxx.c |  4 ++--
 drivers/net/dsa/mv88e6xxx.h |  3 ++-
 include/net/dsa.h   |  3 ++-
 net/dsa/slave.c | 13 +
 5 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 448deb5..f394ea9 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -525,10 +525,10 @@ static int bcm_sf2_sw_br_join(struct dsa_switch *ds, int 
port,
return 0;
 }
 
-static void bcm_sf2_sw_br_leave(struct dsa_switch *ds, int port)
+static void bcm_sf2_sw_br_leave(struct dsa_switch *ds, int port,
+   struct net_device *bridge)
 {
struct bcm_sf2_priv *priv = ds_to_priv(ds);
-   struct net_device *bridge = priv->port_sts[port].bridge_dev;
unsigned int i;
u32 reg, p_ctl;
 
diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index 028f92f..86f8f2f 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -2227,10 +2227,10 @@ int mv88e6xxx_port_bridge_join(struct dsa_switch *ds, 
int port,
return err;
 }
 
-void mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port)
+void mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port,
+struct net_device *bridge)
 {
struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
-   struct net_device *bridge = ps->ports[port].bridge_dev;
int i;
 
mutex_lock(&ps->smi_mutex);
diff --git a/drivers/net/dsa/mv88e6xxx.h b/drivers/net/dsa/mv88e6xxx.h
index 0dbe2d1..2eb9a82 100644
--- a/drivers/net/dsa/mv88e6xxx.h
+++ b/drivers/net/dsa/mv88e6xxx.h
@@ -492,7 +492,8 @@ int mv88e6xxx_set_eee(struct dsa_switch *ds, int port,
  struct phy_device *phydev, struct ethtool_eee *e);
 int mv88e6xxx_port_bridge_join(struct dsa_switch *ds, int port,
   struct net_device *bridge);
-void mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port);
+void mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port,
+struct net_device *bridge);
 void mv88e6xxx_port_stp_state_set(struct dsa_switch *ds, int port, u8 state);
 int mv88e6xxx_port_vlan_filtering(struct dsa_switch *ds, int port,
  bool vlan_filtering);
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 255c108..ed33500 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -305,7 +305,8 @@ struct dsa_switch_driver {
 */
int (*port_bridge_join)(struct dsa_switch *ds, int port,
struct net_device *bridge);
-   void(*port_bridge_leave)(struct dsa_switch *ds, int port);
+   void(*port_bridge_leave)(struct dsa_switch *ds, int port,
+struct net_device *bridge);
void(*port_stp_state_set)(struct dsa_switch *ds, int port,
  u8 state);
 
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 6115444..f2ec13d 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -443,19 +443,24 @@ static int dsa_slave_bridge_port_join(struct net_device 
*dev,
if (ds->drv->port_bridge_join)
ret = ds->drv->port_bridge_join(ds, p->dp->port, br);
 
-   return ret == -EOPNOTSUPP ? 0 : ret;
+   if (ret && ret != -EOPNOTSUPP) {
+   p->bridge_dev = NULL;
+   return ret;
+   }
+
+   return 0;
 }
 
 static void dsa_slave_bridge_port_leave(struct net_device *dev)
 {
struct dsa_slave_priv *p = netdev_priv(dev);
struct dsa_switch *ds = p->dp->ds;
+   struct net_device *br = p->bridge_dev;
 
+   p->bridge_dev = NULL;
 
if (ds->drv->port_bridge_leave)
-   ds->drv->port_bridge_leave(ds, p->dp->port);
-
-   p->bridge_dev = NULL;
+   ds->drv->port_bridge_leave(ds, p->dp->port, br);
 
/* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer,
 * so allow it to be in BR_STATE_FORWARDING to be kept functional
-- 
2.8.0



[RFC 13/20] net: dsa: list switches in tree

2016-04-27 Thread Vivien Didelot
List the registered dsa_switch structures in a "ds" member of the
dsa_switch_tree structure. This allows the drivers to easily iterate on
the DSA switch structures of their related DSA tree.

Signed-off-by: Vivien Didelot 
---
 include/net/dsa.h | 9 +
 net/dsa/dsa.c | 3 +++
 2 files changed, 12 insertions(+)

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 389227d..85fac8a 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -32,11 +32,16 @@ enum dsa_tag_protocol {
 #define DSA_MAX_SWITCHES   4
 #define DSA_MAX_PORTS  12
 
+
+#define dsa_tree_for_each_switch(_dst, _ds)\
+   list_for_each_entry(_ds, &_dst->ds, list)
+
 #define dsa_switch_for_each_port(_ds, _dp, _num_ports) \
for (_dp = list_first_entry(&_ds->dp, typeof(*_dp), list);  \
 &_dp->list != (&_ds->dp) && _dp->port < _num_ports;\
 _dp = list_next_entry(_dp, list))
 
+
 struct dsa_chip_data {
/*
 * How to access the switch configuration registers.
@@ -125,6 +130,8 @@ struct dsa_switch_tree {
 * Data for the individual switch chips.
 */
struct dsa_switch   *switches[DSA_MAX_SWITCHES];
+
+   struct list_headds;
 };
 
 struct dsa_port {
@@ -137,6 +144,8 @@ struct dsa_port {
 };
 
 struct dsa_switch {
+   struct list_headlist;
+
/*
 * Parent switch tree, and switch index.
 */
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index aa4a61a..b0055c7 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -842,6 +842,8 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, 
struct net_device *dev,
int i;
unsigned configured = 0;
 
+   INIT_LIST_HEAD(&dst->ds);
+
dst->pd = pd;
dst->master_netdev = dev;
dst->cpu_switch = -1;
@@ -858,6 +860,7 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, 
struct net_device *dev,
}
 
dst->switches[i] = ds;
+   list_add_tail(&ds->list, &dst->ds);
 
++configured;
}
-- 
2.8.0



[RFC 08/20] net: dsa: bcm_sf2: use bridge device from dsa_port

2016-04-27 Thread Vivien Didelot
Now that the DSA layer exposes the DSA port structures to drivers, use
that to retrieve the port bridge membership and thus get rid of the
private bridge_dev pointer.

Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/bcm_sf2.c | 30 ++
 drivers/net/dsa/bcm_sf2.h |  2 --
 2 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index f7b53fa..6e3b844 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -495,25 +495,24 @@ static int bcm_sf2_sw_br_join(struct dsa_switch *ds, 
struct dsa_port *dp,
  struct net_device *bridge)
 {
struct bcm_sf2_priv *priv = ds_to_priv(ds);
-   unsigned int i;
+   struct dsa_port *intp;
u32 reg, p_ctl;
 
-   priv->port_sts[dp->port].bridge_dev = bridge;
p_ctl = core_readl(priv, CORE_PORT_VLAN_CTL_PORT(dp->port));
 
-   for (i = 0; i < priv->hw_params.num_ports; i++) {
-   if (priv->port_sts[i].bridge_dev != bridge)
+   dsa_switch_for_each_port(ds, intp, priv->hw_params.num_ports) {
+   if (intp->br != bridge)
continue;
 
/* Add this local port to the remote port VLAN control
 * membership and update the remote port bitmask
 */
-   reg = core_readl(priv, CORE_PORT_VLAN_CTL_PORT(i));
+   reg = core_readl(priv, CORE_PORT_VLAN_CTL_PORT(intp->port));
reg |= 1 << dp->port;
-   core_writel(priv, reg, CORE_PORT_VLAN_CTL_PORT(i));
-   priv->port_sts[i].vlan_ctl_mask = reg;
+   core_writel(priv, reg, CORE_PORT_VLAN_CTL_PORT(intp->port));
+   priv->port_sts[intp->port].vlan_ctl_mask = reg;
 
-   p_ctl |= 1 << i;
+   p_ctl |= 1 << intp->port;
}
 
/* Configure the local port VLAN control membership to include
@@ -529,29 +528,28 @@ static void bcm_sf2_sw_br_leave(struct dsa_switch *ds, 
struct dsa_port *dp,
struct net_device *bridge)
 {
struct bcm_sf2_priv *priv = ds_to_priv(ds);
-   unsigned int i;
+   struct dsa_port *intp;
u32 reg, p_ctl;
 
p_ctl = core_readl(priv, CORE_PORT_VLAN_CTL_PORT(dp->port));
 
-   for (i = 0; i < priv->hw_params.num_ports; i++) {
+   dsa_switch_for_each_port(ds, intp, priv->hw_params.num_ports) {
/* Don't touch the remaining ports */
-   if (priv->port_sts[i].bridge_dev != bridge)
+   if (intp->br != bridge)
continue;
 
-   reg = core_readl(priv, CORE_PORT_VLAN_CTL_PORT(i));
+   reg = core_readl(priv, CORE_PORT_VLAN_CTL_PORT(intp->port));
reg &= ~(1 << dp->port);
-   core_writel(priv, reg, CORE_PORT_VLAN_CTL_PORT(i));
+   core_writel(priv, reg, CORE_PORT_VLAN_CTL_PORT(intp->port));
priv->port_sts[dp->port].vlan_ctl_mask = reg;
 
/* Prevent self removal to preserve isolation */
-   if (dp->port != i)
-   p_ctl &= ~(1 << i);
+   if (dp != intp)
+   p_ctl &= ~(1 << intp->port);
}
 
core_writel(priv, p_ctl, CORE_PORT_VLAN_CTL_PORT(dp->port));
priv->port_sts[dp->port].vlan_ctl_mask = p_ctl;
-   priv->port_sts[dp->port].bridge_dev = NULL;
 }
 
 static void bcm_sf2_sw_br_set_stp_state(struct dsa_switch *ds, int port,
diff --git a/drivers/net/dsa/bcm_sf2.h b/drivers/net/dsa/bcm_sf2.h
index 200b1f5..6bba1c9 100644
--- a/drivers/net/dsa/bcm_sf2.h
+++ b/drivers/net/dsa/bcm_sf2.h
@@ -50,8 +50,6 @@ struct bcm_sf2_port_status {
struct ethtool_eee eee;
 
u32 vlan_ctl_mask;
-
-   struct net_device *bridge_dev;
 };
 
 struct bcm_sf2_arl_entry {
-- 
2.8.0



[RFC 20/20] net: dsa: mv88e6xxx: setup PVT on cross-chip ops

2016-04-27 Thread Vivien Didelot
Switches with a Cross-chip Port VLAN Table are currently configured to
allow cross-chip frames to egress any internal ports. This means that
unbridged cross-chip ports can actually talk to each other, and this is
not what we want.

In order to restrict that, we need to setup the PVT entry for an
external port when it joins or leaves a bridge group crossing the switch.

Also initialize the PVT to forbid egressing of cross-chip frames to
internal user ports by default.

Note that a PVT-less switch cannot forbid such frames to egress its
internal ports, unless the kernel supports VLAN filtering. In such
systems, a bridge group is also implemented as a 802.1Q VLAN and thus a
global VTU-based logic can be used to correctly implement cross-chip
hardware bridging. Warn the user if the setup doesn't respect this.

Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/mv88e6xxx.c | 98 +++--
 1 file changed, 95 insertions(+), 3 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index 4341ffd..e0f9e93 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -2272,8 +2272,29 @@ static int _mv88e6xxx_pvt_cmd(struct dsa_switch *ds, int 
src_dev, int src_port,
return _mv88e6xxx_pvt_wait(ds);
 }
 
+static int _mv88e6xxx_pvt_write(struct dsa_switch *ds, int src_dev,
+   int src_port, u16 data)
+{
+   int err;
+
+   err = _mv88e6xxx_pvt_wait(ds);
+   if (err)
+   return err;
+
+   err = _mv88e6xxx_reg_write(ds, REG_GLOBAL2, GLOBAL2_PVT_DATA, data);
+   if (err)
+   return err;
+
+return _mv88e6xxx_pvt_cmd(ds, src_dev, src_port,
+ GLOBAL2_PVT_ADDR_OP_WRITE_PVLAN);
+}
+
 static int _mv88e6xxx_pvt_init(struct dsa_switch *ds)
 {
+   struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
+   struct dsa_port *intp;
+   int src_dev, src_port;
+   u16 pv = 0;
int err;
 
/* Clear 5 Bit Port for usage with Marvell Link Street devices:
@@ -2284,8 +2305,60 @@ static int _mv88e6xxx_pvt_init(struct dsa_switch *ds)
if (err)
return err;
 
-   /* Allow any cross-chip frames to egress any internal ports */
-   return _mv88e6xxx_pvt_cmd(ds, 0, 0, GLOBAL2_PVT_ADDR_OP_INIT_ONES);
+   /* Forbid cross-chip frames to egress internal ports */
+   dsa_switch_for_each_port(ds, intp, ps->info->num_ports)
+   if (dsa_is_cpu_port(ds, intp->port) ||
+   dsa_is_dsa_port(ds, intp->port))
+   pv |= BIT(intp->port);
+
+   for (src_dev = 0; src_dev < 32; ++src_dev) {
+   for (src_port = 0; src_port < 16; ++src_port) {
+   err = _mv88e6xxx_pvt_write(ds, src_dev, src_port, pv);
+   if (err)
+   return err;
+   }
+   }
+
+   return 0;
+}
+
+static int _mv88e6xxx_port_map_pvt(struct dsa_switch *ds, struct dsa_port *dp)
+{
+   struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
+   struct dsa_port *intp;
+   u16 pvlan = 0;
+
+   /* Cross-chip frames can egress CPU and DSA ports, and bridge members */
+   dsa_switch_for_each_port(ds, intp, ps->info->num_ports)
+   if (dsa_is_cpu_port(ds, intp->port) ||
+   dsa_is_dsa_port(ds, intp->port) ||
+   (intp->br && intp->br == dp->br))
+   pvlan |= BIT(intp->port);
+
+   return _mv88e6xxx_pvt_write(ds, dp->ds->index, dp->port, pvlan);
+}
+
+static int _mv88e6xxx_remap_pvt(struct dsa_switch *ds,
+   struct net_device *bridge)
+{
+   struct dsa_switch *dsa_sw;
+   struct dsa_port *dsa_p;
+   int err;
+
+   dsa_tree_for_each_switch(ds->dst, dsa_sw) {
+   if (dsa_sw == ds)
+   continue;
+
+   dsa_switch_for_each_port(dsa_sw, dsa_p, DSA_MAX_PORTS) {
+   if (dsa_p->br == bridge) {
+   err = _mv88e6xxx_port_map_pvt(ds, dsa_p);
+   if (err)
+   return err;
+   }
+   }
+   }
+
+   return 0;
 }
 
 int mv88e6xxx_port_bridge_change(struct dsa_switch *ds, struct dsa_port *dp,
@@ -2297,7 +2370,19 @@ int mv88e6xxx_port_bridge_change(struct dsa_switch *ds, 
struct dsa_port *dp,
mutex_lock(&ps->smi_mutex);
 
if (dsa_port_is_external(dp, ds)) {
-   err = -EOPNOTSUPP;
+   /* Forbidding hardware bridging of cross-chip frames requires a
+* Cross-chip Port VLAN Table (PVT), unless VLAN filtering is
+* enabled, in which case a global VTU-based logic works.
+*/
+   if (mv88e6xxx_has(ps, MV88E6XXX_FLAG_PVT)) {
+   err = _mv88e6xxx_port_map_pvt(ds, dp);
+ 

[RFC 01/20] net: dsa: introduce a dsa_port structure

2016-04-27 Thread Vivien Didelot
Introduce a new dsa_port structure, used to store port-centric
information, such as a pointer to its DSA switch and its port number.
It will later contain further data, such as its bridge device.

This is a first step towards implementing cross-chip port operations.

Signed-off-by: Vivien Didelot 
---
 include/net/dsa.h |   5 ++
 net/dsa/dsa.c |  10 +++-
 net/dsa/dsa_priv.h|  13 ++---
 net/dsa/slave.c   | 147 +-
 net/dsa/tag_brcm.c|   4 +-
 net/dsa/tag_dsa.c |   8 +--
 net/dsa/tag_edsa.c|   8 +--
 net/dsa/tag_trailer.c |   2 +-
 8 files changed, 104 insertions(+), 93 deletions(-)

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 2d280ab..255c108 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -122,6 +122,11 @@ struct dsa_switch_tree {
struct dsa_switch   *ds[DSA_MAX_SWITCHES];
 };
 
+struct dsa_port {
+   struct dsa_switch   *ds;
+   int port;
+};
+
 struct dsa_switch {
/*
 * Parent switch tree, and switch index.
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index d61ceed..222494c 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -219,6 +219,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, 
struct device *parent)
 {
struct dsa_switch_driver *drv = ds->drv;
struct dsa_switch_tree *dst = ds->dst;
+   struct dsa_port *dp[DSA_MAX_PORTS];
struct dsa_chip_data *pd = ds->pd;
bool valid_name_found = false;
int index = ds->index;
@@ -230,6 +231,13 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, 
struct device *parent)
for (i = 0; i < DSA_MAX_PORTS; i++) {
char *name;
 
+   dp[i] = devm_kzalloc(parent, sizeof(*dp), GFP_KERNEL);
+   if (dp[i] == NULL)
+   return -ENOMEM;
+
+   dp[i]->ds = ds;
+   dp[i]->port = i;
+
name = pd->port_names[i];
if (name == NULL)
continue;
@@ -328,7 +336,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, 
struct device *parent)
if (!(ds->enabled_port_mask & (1 << i)))
continue;
 
-   ret = dsa_slave_create(ds, parent, i, pd->port_names[i]);
+   ret = dsa_slave_create(dp[i], parent, pd->port_names[i]);
if (ret < 0) {
netdev_err(dst->master_netdev, "[%d]: can't create dsa 
slave device for port %d(%s): %d\n",
   index, i, pd->port_names[i], ret);
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index dfa3377..c7d5df0 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -26,13 +26,6 @@ struct dsa_slave_priv {
struct net_device *dev);
 
/*
-* Which switch this port is a part of, and the port index
-* for this port.
-*/
-   struct dsa_switch   *parent;
-   u8  port;
-
-   /*
 * The phylib phy_device pointer for the PHY connected
 * to this port.
 */
@@ -46,6 +39,9 @@ struct dsa_slave_priv {
 #ifdef CONFIG_NET_POLL_CONTROLLER
struct netpoll  *netpoll;
 #endif
+
+   /* DSA specific data */
+   struct dsa_port *dp;
 };
 
 /* dsa.c */
@@ -54,8 +50,7 @@ extern char dsa_driver_version[];
 /* slave.c */
 extern const struct dsa_device_ops notag_netdev_ops;
 void dsa_slave_mii_bus_init(struct dsa_switch *ds);
-int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
-int port, char *name);
+int dsa_slave_create(struct dsa_port *dp, struct device *parent, char *name);
 void dsa_slave_destroy(struct net_device *slave_dev);
 int dsa_slave_suspend(struct net_device *slave_dev);
 int dsa_slave_resume(struct net_device *slave_dev);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 3b6750f..6115444 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -61,7 +61,7 @@ static int dsa_slave_get_iflink(const struct net_device *dev)
 {
struct dsa_slave_priv *p = netdev_priv(dev);
 
-   return p->parent->dst->master_netdev->ifindex;
+   return p->dp->ds->dst->master_netdev->ifindex;
 }
 
 static inline bool dsa_port_is_bridged(struct dsa_slave_priv *p)
@@ -72,8 +72,8 @@ static inline bool dsa_port_is_bridged(struct dsa_slave_priv 
*p)
 static int dsa_slave_open(struct net_device *dev)
 {
struct dsa_slave_priv *p = netdev_priv(dev);
-   struct net_device *master = p->parent->dst->master_netdev;
-   struct dsa_switch *ds = p->parent;
+   struct dsa_switch *ds = p->dp->ds;
+   struct net_device *master = ds->dst->master_netdev;
u8 stp_state = dsa_port_is_bridged(p) ?
BR_STATE_BLOCKING : BR_STATE_FORWARDING;
int err;
@@ -99,13 +99,13 @@ static int dsa_slave_open(struct net_device *dev)
}

[RFC 20/20] net: dsa: mv88e6xxx: setup PVT on cross-chip ops

2016-04-27 Thread Vivien Didelot
Switches with a Cross-chip Port VLAN Table are currently configured to
allow cross-chip frames to egress any internal ports. This means that
unbridged cross-chip ports can actually talk to each other, and this is
not what we want.

In order to restrict that, we need to setup the PVT entry for an
external port when it joins or leaves a bridge group crossing the switch.

Also initialize the PVT to forbid egressing of cross-chip frames to
internal user ports by default.

Note that a PVT-less switch cannot forbid such frames to egress its
internal ports, unless the kernel supports VLAN filtering. In such
systems, a bridge group is also implemented as a 802.1Q VLAN and thus a
global VTU-based logic can be used to correctly implement cross-chip
hardware bridging. Warn the user if the setup doesn't respect this.

Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/mv88e6xxx.c | 98 +++--
 1 file changed, 95 insertions(+), 3 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index 4341ffd..e0f9e93 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -2272,8 +2272,29 @@ static int _mv88e6xxx_pvt_cmd(struct dsa_switch *ds, int 
src_dev, int src_port,
return _mv88e6xxx_pvt_wait(ds);
 }
 
+static int _mv88e6xxx_pvt_write(struct dsa_switch *ds, int src_dev,
+   int src_port, u16 data)
+{
+   int err;
+
+   err = _mv88e6xxx_pvt_wait(ds);
+   if (err)
+   return err;
+
+   err = _mv88e6xxx_reg_write(ds, REG_GLOBAL2, GLOBAL2_PVT_DATA, data);
+   if (err)
+   return err;
+
+return _mv88e6xxx_pvt_cmd(ds, src_dev, src_port,
+ GLOBAL2_PVT_ADDR_OP_WRITE_PVLAN);
+}
+
 static int _mv88e6xxx_pvt_init(struct dsa_switch *ds)
 {
+   struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
+   struct dsa_port *intp;
+   int src_dev, src_port;
+   u16 pv = 0;
int err;
 
/* Clear 5 Bit Port for usage with Marvell Link Street devices:
@@ -2284,8 +2305,60 @@ static int _mv88e6xxx_pvt_init(struct dsa_switch *ds)
if (err)
return err;
 
-   /* Allow any cross-chip frames to egress any internal ports */
-   return _mv88e6xxx_pvt_cmd(ds, 0, 0, GLOBAL2_PVT_ADDR_OP_INIT_ONES);
+   /* Forbid cross-chip frames to egress internal ports */
+   dsa_switch_for_each_port(ds, intp, ps->info->num_ports)
+   if (dsa_is_cpu_port(ds, intp->port) ||
+   dsa_is_dsa_port(ds, intp->port))
+   pv |= BIT(intp->port);
+
+   for (src_dev = 0; src_dev < 32; ++src_dev) {
+   for (src_port = 0; src_port < 16; ++src_port) {
+   err = _mv88e6xxx_pvt_write(ds, src_dev, src_port, pv);
+   if (err)
+   return err;
+   }
+   }
+
+   return 0;
+}
+
+static int _mv88e6xxx_port_map_pvt(struct dsa_switch *ds, struct dsa_port *dp)
+{
+   struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
+   struct dsa_port *intp;
+   u16 pvlan = 0;
+
+   /* Cross-chip frames can egress CPU and DSA ports, and bridge members */
+   dsa_switch_for_each_port(ds, intp, ps->info->num_ports)
+   if (dsa_is_cpu_port(ds, intp->port) ||
+   dsa_is_dsa_port(ds, intp->port) ||
+   (intp->br && intp->br == dp->br))
+   pvlan |= BIT(intp->port);
+
+   return _mv88e6xxx_pvt_write(ds, dp->ds->index, dp->port, pvlan);
+}
+
+static int _mv88e6xxx_remap_pvt(struct dsa_switch *ds,
+   struct net_device *bridge)
+{
+   struct dsa_switch *dsa_sw;
+   struct dsa_port *dsa_p;
+   int err;
+
+   dsa_tree_for_each_switch(ds->dst, dsa_sw) {
+   if (dsa_sw == ds)
+   continue;
+
+   dsa_switch_for_each_port(dsa_sw, dsa_p, DSA_MAX_PORTS) {
+   if (dsa_p->br == bridge) {
+   err = _mv88e6xxx_port_map_pvt(ds, dsa_p);
+   if (err)
+   return err;
+   }
+   }
+   }
+
+   return 0;
 }
 
 int mv88e6xxx_port_bridge_change(struct dsa_switch *ds, struct dsa_port *dp,
@@ -2297,7 +2370,19 @@ int mv88e6xxx_port_bridge_change(struct dsa_switch *ds, 
struct dsa_port *dp,
mutex_lock(&ps->smi_mutex);
 
if (dsa_port_is_external(dp, ds)) {
-   err = -EOPNOTSUPP;
+   /* Forbidding hardware bridging of cross-chip frames requires a
+* Cross-chip Port VLAN Table (PVT), unless VLAN filtering is
+* enabled, in which case a global VTU-based logic works.
+*/
+   if (mv88e6xxx_has(ps, MV88E6XXX_FLAG_PVT)) {
+   err = _mv88e6xxx_port_map_pvt(ds, dp);
+   } else if 

[RFC 01/20] net: dsa: introduce a dsa_port structure

2016-04-27 Thread Vivien Didelot
Introduce a new dsa_port structure, used to store port-centric
information, such as a pointer to its DSA switch and its port number.
It will later contain further data, such as its bridge device.

This is a first step towards implementing cross-chip port operations.

Signed-off-by: Vivien Didelot 
---
 include/net/dsa.h |   5 ++
 net/dsa/dsa.c |  10 +++-
 net/dsa/dsa_priv.h|  13 ++---
 net/dsa/slave.c   | 147 +-
 net/dsa/tag_brcm.c|   4 +-
 net/dsa/tag_dsa.c |   8 +--
 net/dsa/tag_edsa.c|   8 +--
 net/dsa/tag_trailer.c |   2 +-
 8 files changed, 104 insertions(+), 93 deletions(-)

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 2d280ab..255c108 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -122,6 +122,11 @@ struct dsa_switch_tree {
struct dsa_switch   *ds[DSA_MAX_SWITCHES];
 };
 
+struct dsa_port {
+   struct dsa_switch   *ds;
+   int port;
+};
+
 struct dsa_switch {
/*
 * Parent switch tree, and switch index.
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index d61ceed..222494c 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -219,6 +219,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, 
struct device *parent)
 {
struct dsa_switch_driver *drv = ds->drv;
struct dsa_switch_tree *dst = ds->dst;
+   struct dsa_port *dp[DSA_MAX_PORTS];
struct dsa_chip_data *pd = ds->pd;
bool valid_name_found = false;
int index = ds->index;
@@ -230,6 +231,13 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, 
struct device *parent)
for (i = 0; i < DSA_MAX_PORTS; i++) {
char *name;
 
+   dp[i] = devm_kzalloc(parent, sizeof(*dp), GFP_KERNEL);
+   if (dp[i] == NULL)
+   return -ENOMEM;
+
+   dp[i]->ds = ds;
+   dp[i]->port = i;
+
name = pd->port_names[i];
if (name == NULL)
continue;
@@ -328,7 +336,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, 
struct device *parent)
if (!(ds->enabled_port_mask & (1 << i)))
continue;
 
-   ret = dsa_slave_create(ds, parent, i, pd->port_names[i]);
+   ret = dsa_slave_create(dp[i], parent, pd->port_names[i]);
if (ret < 0) {
netdev_err(dst->master_netdev, "[%d]: can't create dsa 
slave device for port %d(%s): %d\n",
   index, i, pd->port_names[i], ret);
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index dfa3377..c7d5df0 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -26,13 +26,6 @@ struct dsa_slave_priv {
struct net_device *dev);
 
/*
-* Which switch this port is a part of, and the port index
-* for this port.
-*/
-   struct dsa_switch   *parent;
-   u8  port;
-
-   /*
 * The phylib phy_device pointer for the PHY connected
 * to this port.
 */
@@ -46,6 +39,9 @@ struct dsa_slave_priv {
 #ifdef CONFIG_NET_POLL_CONTROLLER
struct netpoll  *netpoll;
 #endif
+
+   /* DSA specific data */
+   struct dsa_port *dp;
 };
 
 /* dsa.c */
@@ -54,8 +50,7 @@ extern char dsa_driver_version[];
 /* slave.c */
 extern const struct dsa_device_ops notag_netdev_ops;
 void dsa_slave_mii_bus_init(struct dsa_switch *ds);
-int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
-int port, char *name);
+int dsa_slave_create(struct dsa_port *dp, struct device *parent, char *name);
 void dsa_slave_destroy(struct net_device *slave_dev);
 int dsa_slave_suspend(struct net_device *slave_dev);
 int dsa_slave_resume(struct net_device *slave_dev);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 3b6750f..6115444 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -61,7 +61,7 @@ static int dsa_slave_get_iflink(const struct net_device *dev)
 {
struct dsa_slave_priv *p = netdev_priv(dev);
 
-   return p->parent->dst->master_netdev->ifindex;
+   return p->dp->ds->dst->master_netdev->ifindex;
 }
 
 static inline bool dsa_port_is_bridged(struct dsa_slave_priv *p)
@@ -72,8 +72,8 @@ static inline bool dsa_port_is_bridged(struct dsa_slave_priv 
*p)
 static int dsa_slave_open(struct net_device *dev)
 {
struct dsa_slave_priv *p = netdev_priv(dev);
-   struct net_device *master = p->parent->dst->master_netdev;
-   struct dsa_switch *ds = p->parent;
+   struct dsa_switch *ds = p->dp->ds;
+   struct net_device *master = ds->dst->master_netdev;
u8 stp_state = dsa_port_is_bridged(p) ?
BR_STATE_BLOCKING : BR_STATE_FORWARDING;
int err;
@@ -99,13 +99,13 @@ static int dsa_slave_open(struct net_device *dev)
}
 
if (ds->drv->port_enable) 

Re: [PATCH] xfs: idle aild if the AIL is pushed up to the target LSN

2016-04-27 Thread Dave Chinner
On Wed, Apr 27, 2016 at 08:31:38PM +0200, Lucas Stach wrote:
> Am Dienstag, den 26.04.2016, 09:08 +1000 schrieb Dave Chinner:
> [...]
> > > 
> > > > 
> > > > That said, I'm not sure whether there's a notable benefit of
> > > > idling
> > > > for
> > > > 50ms over just scheduling out when we've hit the target lsn. It
> > > > seems
> > > > like that anybody who pushes the target forward again is going to
> > > > wake
> > > > up the thread anyways. On the other hand, if the fs is idle the
> > > > thread
> > > > will eventually schedule out indefinitely. 
> > > Is this a problem? The patch tries to do exactly that: schedule out
> > > aild indefinitely when there is no more work to do as nobody is
> > > pushing
> > > the target LSN forward.
> > > If the filesystem is slowly being dirtied, then the aild shouldn't
> > > really idle at all.
> > 
> > Keep in mind that the xfsaild has multiple functions, one of which
> > is a watchdog that catches log space stalls that would otherwise
> > hang the filesystem. Every time we've removed the watchdog function
> > > (i.e. aggressively idle the aild) we've had users report random,
> > > unreproducible hangs/stalls that have gone away when the watchdog
> > function (i.e. don't idle until the log is covered and completely
> > idle) was re-instated...
> > 
> I can only see xfsaild_push() doing any work after it has hit the
> target LSN if something moves the target LSN forward. You say that
> aggressively idling aild might produce log stalls, which would imply
> there are races in the code where a code path that moves the target LSN
> forward doesn't properly wake up aild.

Well, yes. The code is horrifically complex, there's a heap of
lockless operations along with cross-subsystem co-ordinated
operations done under different locks amongst other things to
provide scalability. History tells me that no matter whether we
*think* we've got it right, there's always another bug lurking.

> Wouldn't this problem also be present when doing non-aggressive idle of
> aild, just the probability of hitting the issue being reduced
> significantly?

Welcome to Risk Management 101.

I've got better things to do with my time than remove a safety net
and then be forced to spend days or even weeks trying to solve all
the subtle, deeply hidden problems that have been around for 20
years that are now exposed to users. That's not a productive use of
the limited amount of XFS developer's time we have available. If you
want to go ahead and do all this, I'll be happy to spend a year
teaching you about how all the log space reservation code works...

> The commit that re-enabled non-aggressive aild idle
> especially mentions some races that have been fixed and I think those
> fixes should allow for agressive aild idle. If they are insufficient it
> wouldn't be safe to idle aild at all, right?

No - the log state machine that covers (idles) the log is the one we
really care about and it guarantees that the AIL is empty. i.e. The
AIL has active items in it until the log is covered and hence, by
definition, it can't be idle until the log is covered.

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com


Re: [PATCH] xfs: idle aild if the AIL is pushed up to the target LSN

2016-04-27 Thread Dave Chinner
On Wed, Apr 27, 2016 at 08:31:38PM +0200, Lucas Stach wrote:
> Am Dienstag, den 26.04.2016, 09:08 +1000 schrieb Dave Chinner:
> [...]
> > > 
> > > > 
> > > > That said, I'm not sure whether there's a notable benefit of
> > > > idling
> > > > for
> > > > 50ms over just scheduling out when we've hit the target lsn. It
> > > > seems
> > > > like that anybody who pushes the target forward again is going to
> > > > wake
> > > > up the thread anyways. On the other hand, if the fs is idle the
> > > > thread
> > > > will eventually schedule out indefinitely. 
> > > Is this a problem? The patch tries to do exactly that: schedule out
> > > aild indefinitely when there is no more work to do as nobody is
> > > pushing
> > > the target LSN forward.
> > > If the filesystem is slowly being dirtied, then the aild shouldn't
> > > really idle at all.
> > 
> > Keep in mind that the xfsaild has multiple functions, one of which
> > is a watchdog that catches log space stalls that would otherwise
> > hang the filesystem. Every time we've removed the watchdog function
> > > (i.e. aggressively idle the aild) we've had users report random,
> > > unreproducible hangs/stalls that have gone away when the watchdog
> > function (i.e. don't idle until the log is covered and completely
> > idle) was re-instated...
> > 
> I can only see xfsaild_push() doing any work after it has hit the
> target LSN if something moves the target LSN forward. You say that
> aggressively idling aild might produce log stalls, which would imply
> there are races in the code where a code path that moves the target LSN
> forward doesn't properly wake up aild.

Well, yes. The code is horrifically complex, there's a heap of
lockless operations along with cross-subsystem co-ordinated
operations done under different locks amongst other things to
provide scalability. History tells me that no matter whether we
*think* we've got it right, there's always another bug lurking.

> Wouldn't this problem also be present when doing non-aggressive idle of
> aild, just the probability of hitting the issue being reduced
> significantly?

Welcome to Risk Management 101.

I've got better things to do with my time than remove a safety net
and then be forced to spend days or even weeks trying to solve all
the subtle, deeply hidden problems that have been around for 20
years that are now exposed to users. That's not a productive use of
the limited amount of XFS developer's time we have available. If you
want to go ahead and do all this, I'll be happy to spend a year
teaching you about how all the log space reservation code works...

> The commit that re-enabled non-aggressive aild idle
> especially mentions some races that have been fixed and I think those
> fixes should allow for agressive aild idle. If they are insufficient it
> wouldn't be safe to idle aild at all, right?

No - the log state machine that covers (idles) the log is the one we
really care about and it guarantees that the AIL is empty. i.e. The
AIL has active items in it until the log is covered and hence, by
definition, it can't be idle until the log is covered.

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com


[PATCH 05/15] staging: lustre: ldlm: use accessor macros for l_flags

2016-04-27 Thread James Simmons
From: Bruce Korb 

Convert most of the ldlm lock's l_flags references from direct
bit twiddling to using bit specific macros.  A few multi-bit
operations are left as an exercise for the reader.

The changes are mostly in ldlm, but also in llite, osc and quota.
Also add a multi-bit (mask) test.

Signed-off-by: Bruce Korb 
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-2906
Reviewed-by: Keith Mannthey 
Reviewed-on: http://review.whamcloud.com/7963
Reviewed-by: Doug Oucharek 
Reviewed-by: Andreas Dilger 
Signed-off-by: James Simmons 
---
 .../lustre/lustre/include/lustre_dlm_flags.h   |3 +
 drivers/staging/lustre/lustre/ldlm/l_lock.c|4 +-
 drivers/staging/lustre/lustre/ldlm/ldlm_extent.c   |4 +-
 drivers/staging/lustre/lustre/ldlm/ldlm_flock.c|   11 +--
 drivers/staging/lustre/lustre/ldlm/ldlm_internal.h |7 +-
 drivers/staging/lustre/lustre/ldlm/ldlm_lock.c |   95 ++--
 drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c|   26 +++---
 drivers/staging/lustre/lustre/ldlm/ldlm_request.c  |   32 +++
 drivers/staging/lustre/lustre/ldlm/ldlm_resource.c |   12 ++--
 drivers/staging/lustre/lustre/llite/dcache.c   |7 +-
 drivers/staging/lustre/lustre/llite/file.c |6 +-
 drivers/staging/lustre/lustre/llite/namei.c|2 +-
 drivers/staging/lustre/lustre/osc/osc_lock.c   |2 +-
 13 files changed, 102 insertions(+), 109 deletions(-)

diff --git a/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h 
b/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h
index 7f2ba2f..aff0904 100644
--- a/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h
+++ b/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h
@@ -381,6 +381,9 @@
 /** test for ldlm_lock flag bit set */
 #define LDLM_TEST_FLAG(_l, _b)(((_l)->l_flags & (_b)) != 0)
 
+/** multi-bit test: are any of mask bits set? */
+#define LDLM_HAVE_MASK(_l, _m) ((_l)->l_flags & LDLM_FL_##_m##_MASK)
+
 /** set a ldlm_lock flag bit */
 #define LDLM_SET_FLAG(_l, _b) ((_l)->l_flags |= (_b))
 
diff --git a/drivers/staging/lustre/lustre/ldlm/l_lock.c 
b/drivers/staging/lustre/lustre/ldlm/l_lock.c
index e5d1344..621323f 100644
--- a/drivers/staging/lustre/lustre/ldlm/l_lock.c
+++ b/drivers/staging/lustre/lustre/ldlm/l_lock.c
@@ -54,7 +54,7 @@ struct ldlm_resource *lock_res_and_lock(struct ldlm_lock 
*lock)
 
lock_res(lock->l_resource);
 
-   lock->l_flags |= LDLM_FL_RES_LOCKED;
+   ldlm_set_res_locked(lock);
return lock->l_resource;
 }
 EXPORT_SYMBOL(lock_res_and_lock);
@@ -65,7 +65,7 @@ EXPORT_SYMBOL(lock_res_and_lock);
 void unlock_res_and_lock(struct ldlm_lock *lock)
 {
/* on server-side resource of lock doesn't change */
-   lock->l_flags &= ~LDLM_FL_RES_LOCKED;
+   ldlm_clear_res_locked(lock);
 
unlock_res(lock->l_resource);
spin_unlock(&lock->l_lock);
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c 
b/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c
index a803e20..cf1f178 100644
--- a/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c
@@ -75,12 +75,12 @@ __u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 
old_kms)
 * just after we finish and take our lock into account in its
 * calculation of the kms
 */
-   lock->l_flags |= LDLM_FL_KMS_IGNORE;
+   ldlm_set_kms_ignore(lock);
 
list_for_each(tmp, &res->lr_granted) {
lck = list_entry(tmp, struct ldlm_lock, l_res_link);
 
-   if (lck->l_flags & LDLM_FL_KMS_IGNORE)
+   if (ldlm_is_kms_ignore(lck))
continue;
 
if (lck->l_policy_data.l_extent.end >= old_kms)
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c 
b/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c
index 5102d78..349bfcc 100644
--- a/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c
@@ -101,8 +101,7 @@ ldlm_flock_destroy(struct ldlm_lock *lock, enum ldlm_mode 
mode, __u64 flags)
LASSERT(hlist_unhashed(&lock->l_exp_flock_hash));
 
list_del_init(&lock->l_res_link);
-   if (flags == LDLM_FL_WAIT_NOREPROC &&
-   !(lock->l_flags & LDLM_FL_FAILED)) {
+   if (flags == LDLM_FL_WAIT_NOREPROC && !ldlm_is_failed(lock)) {
/* client side - set a flag to prevent sending a CANCEL */
lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_CBPENDING;
 
@@ -436,7 +435,7 @@ ldlm_flock_interrupted_wait(void *data)
lock_res_and_lock(lock);
 
/* client side - set flag to prevent lock from being put on LRU list */
-   lock->l_flags |= LDLM_FL_CBPENDING;
+   ldlm_set_cbpending(lock);
unlock_res_and_lock(lock);
 }
 
@@ -520,7 +519,7 @@ 

[PATCH 05/15] staging: lustre: ldlm: use accessor macros for l_flags

2016-04-27 Thread James Simmons
From: Bruce Korb 

Convert most of the ldlm lock's l_flags references from direct
bit twiddling to bit-specific accessor macros.  A few multi-bit
operations are left as an exercise for the reader.

The changes are mostly in ldlm, but also in llite, osc and quota.
Also add a multi-bit (mask) test.

Signed-off-by: Bruce Korb 
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-2906
Reviewed-by: Keith Mannthey 
Reviewed-on: http://review.whamcloud.com/7963
Reviewed-by: Doug Oucharek 
Reviewed-by: Andreas Dilger 
Signed-off-by: James Simmons 
---
 .../lustre/lustre/include/lustre_dlm_flags.h   |3 +
 drivers/staging/lustre/lustre/ldlm/l_lock.c|4 +-
 drivers/staging/lustre/lustre/ldlm/ldlm_extent.c   |4 +-
 drivers/staging/lustre/lustre/ldlm/ldlm_flock.c|   11 +--
 drivers/staging/lustre/lustre/ldlm/ldlm_internal.h |7 +-
 drivers/staging/lustre/lustre/ldlm/ldlm_lock.c |   95 ++--
 drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c|   26 +++---
 drivers/staging/lustre/lustre/ldlm/ldlm_request.c  |   32 +++
 drivers/staging/lustre/lustre/ldlm/ldlm_resource.c |   12 ++--
 drivers/staging/lustre/lustre/llite/dcache.c   |7 +-
 drivers/staging/lustre/lustre/llite/file.c |6 +-
 drivers/staging/lustre/lustre/llite/namei.c|2 +-
 drivers/staging/lustre/lustre/osc/osc_lock.c   |2 +-
 13 files changed, 102 insertions(+), 109 deletions(-)

diff --git a/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h 
b/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h
index 7f2ba2f..aff0904 100644
--- a/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h
+++ b/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h
@@ -381,6 +381,9 @@
 /** test for ldlm_lock flag bit set */
 #define LDLM_TEST_FLAG(_l, _b)(((_l)->l_flags & (_b)) != 0)
 
+/** multi-bit test: are any of mask bits set? */
+#define LDLM_HAVE_MASK(_l, _m) ((_l)->l_flags & LDLM_FL_##_m##_MASK)
+
 /** set a ldlm_lock flag bit */
 #define LDLM_SET_FLAG(_l, _b) ((_l)->l_flags |= (_b))
 
diff --git a/drivers/staging/lustre/lustre/ldlm/l_lock.c 
b/drivers/staging/lustre/lustre/ldlm/l_lock.c
index e5d1344..621323f 100644
--- a/drivers/staging/lustre/lustre/ldlm/l_lock.c
+++ b/drivers/staging/lustre/lustre/ldlm/l_lock.c
@@ -54,7 +54,7 @@ struct ldlm_resource *lock_res_and_lock(struct ldlm_lock 
*lock)
 
lock_res(lock->l_resource);
 
-   lock->l_flags |= LDLM_FL_RES_LOCKED;
+   ldlm_set_res_locked(lock);
return lock->l_resource;
 }
 EXPORT_SYMBOL(lock_res_and_lock);
@@ -65,7 +65,7 @@ EXPORT_SYMBOL(lock_res_and_lock);
 void unlock_res_and_lock(struct ldlm_lock *lock)
 {
/* on server-side resource of lock doesn't change */
-   lock->l_flags &= ~LDLM_FL_RES_LOCKED;
+   ldlm_clear_res_locked(lock);
 
unlock_res(lock->l_resource);
spin_unlock(>l_lock);
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c 
b/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c
index a803e20..cf1f178 100644
--- a/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c
@@ -75,12 +75,12 @@ __u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 
old_kms)
 * just after we finish and take our lock into account in its
 * calculation of the kms
 */
-   lock->l_flags |= LDLM_FL_KMS_IGNORE;
+   ldlm_set_kms_ignore(lock);
 
list_for_each(tmp, >lr_granted) {
lck = list_entry(tmp, struct ldlm_lock, l_res_link);
 
-   if (lck->l_flags & LDLM_FL_KMS_IGNORE)
+   if (ldlm_is_kms_ignore(lck))
continue;
 
if (lck->l_policy_data.l_extent.end >= old_kms)
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c 
b/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c
index 5102d78..349bfcc 100644
--- a/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c
@@ -101,8 +101,7 @@ ldlm_flock_destroy(struct ldlm_lock *lock, enum ldlm_mode 
mode, __u64 flags)
LASSERT(hlist_unhashed(>l_exp_flock_hash));
 
list_del_init(>l_res_link);
-   if (flags == LDLM_FL_WAIT_NOREPROC &&
-   !(lock->l_flags & LDLM_FL_FAILED)) {
+   if (flags == LDLM_FL_WAIT_NOREPROC && !ldlm_is_failed(lock)) {
/* client side - set a flag to prevent sending a CANCEL */
lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_CBPENDING;
 
@@ -436,7 +435,7 @@ ldlm_flock_interrupted_wait(void *data)
lock_res_and_lock(lock);
 
/* client side - set flag to prevent lock from being put on LRU list */
-   lock->l_flags |= LDLM_FL_CBPENDING;
+   ldlm_set_cbpending(lock);
unlock_res_and_lock(lock);
 }
 
@@ -520,7 +519,7 @@ ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 
flags, void *data)
 granted:
OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT, 10);
 
-   if (lock->l_flags 

[PATCH 03/15] staging: lustre: obdclass: add LCT_SERVER_SESSION for server session

2016-04-27 Thread James Simmons
From: Wang Di 

Add LCT_SERVER_SESSION for server sessions and separate the
server session flag from LCT_SESSION. This avoids allocating
client-stack session info for each server request when the
client and server are on the same node.

Signed-off-by: Wang Di 
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-3806
Reviewed-on: http://review.whamcloud.com/7412
Reviewed-by: John L. Hammond 
Reviewed-by: Mike Pershin 
Reviewed-by: Oleg Drokin 
Signed-off-by: James Simmons 
---
 drivers/staging/lustre/lustre/include/lu_object.h |4 
 drivers/staging/lustre/lustre/include/obd_class.h |2 +-
 drivers/staging/lustre/lustre/ptlrpc/service.c|3 ++-
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/staging/lustre/lustre/include/lu_object.h 
b/drivers/staging/lustre/lustre/include/lu_object.h
index b5b0c81..a27e76f 100644
--- a/drivers/staging/lustre/lustre/include/lu_object.h
+++ b/drivers/staging/lustre/lustre/include/lu_object.h
@@ -1006,6 +1006,10 @@ enum lu_context_tag {
 */
LCT_LOCAL = 1 << 7,
/**
+* session for server thread
+**/
+   LCT_SERVER_SESSION = BIT(8),
+   /**
 * Set when at least one of keys, having values in this context has
 * non-NULL lu_context_key::lct_exit() method. This is used to
 * optimize lu_context_exit() call.
diff --git a/drivers/staging/lustre/lustre/include/obd_class.h 
b/drivers/staging/lustre/lustre/include/obd_class.h
index 40f7a23..32863bc 100644
--- a/drivers/staging/lustre/lustre/include/obd_class.h
+++ b/drivers/staging/lustre/lustre/include/obd_class.h
@@ -477,7 +477,7 @@ static inline int obd_setup(struct obd_device *obd, struct 
lustre_cfg *cfg)
struct lu_context  session_ctx;
struct lu_env env;
 
-   lu_context_init(_ctx, LCT_SESSION);
+   lu_context_init(_ctx, LCT_SESSION | LCT_SERVER_SESSION);
session_ctx.lc_thread = NULL;
lu_context_enter(_ctx);
 
diff --git a/drivers/staging/lustre/lustre/ptlrpc/service.c 
b/drivers/staging/lustre/lustre/ptlrpc/service.c
index 1bbd1d3..fc2632f 100644
--- a/drivers/staging/lustre/lustre/ptlrpc/service.c
+++ b/drivers/staging/lustre/lustre/ptlrpc/service.c
@@ -1649,7 +1649,8 @@ ptlrpc_server_handle_request(struct ptlrpc_service_part 
*svcpt,
at_get(>scp_at_estimate));
}
 
-   rc = lu_context_init(>rq_session, LCT_SESSION | LCT_NOREF);
+   rc = lu_context_init(>rq_session, LCT_SERVER_SESSION |
+  LCT_NOREF);
if (rc) {
CERROR("Failure to initialize session: %d\n", rc);
goto out_req;
-- 
1.7.1



[PATCH 03/15] staging: lustre: obdclass: add LCT_SERVER_SESSION for server session

2016-04-27 Thread James Simmons
From: Wang Di 

Add LCT_SERVER_SESSION for server sessions and separate the
server session flag from LCT_SESSION. This avoids allocating
client-stack session info for each server request when the
client and server are on the same node.

Signed-off-by: Wang Di 
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-3806
Reviewed-on: http://review.whamcloud.com/7412
Reviewed-by: John L. Hammond 
Reviewed-by: Mike Pershin 
Reviewed-by: Oleg Drokin 
Signed-off-by: James Simmons 
---
 drivers/staging/lustre/lustre/include/lu_object.h |4 
 drivers/staging/lustre/lustre/include/obd_class.h |2 +-
 drivers/staging/lustre/lustre/ptlrpc/service.c|3 ++-
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/staging/lustre/lustre/include/lu_object.h 
b/drivers/staging/lustre/lustre/include/lu_object.h
index b5b0c81..a27e76f 100644
--- a/drivers/staging/lustre/lustre/include/lu_object.h
+++ b/drivers/staging/lustre/lustre/include/lu_object.h
@@ -1006,6 +1006,10 @@ enum lu_context_tag {
 */
LCT_LOCAL = 1 << 7,
/**
+* session for server thread
+**/
+   LCT_SERVER_SESSION = BIT(8),
+   /**
 * Set when at least one of keys, having values in this context has
 * non-NULL lu_context_key::lct_exit() method. This is used to
 * optimize lu_context_exit() call.
diff --git a/drivers/staging/lustre/lustre/include/obd_class.h 
b/drivers/staging/lustre/lustre/include/obd_class.h
index 40f7a23..32863bc 100644
--- a/drivers/staging/lustre/lustre/include/obd_class.h
+++ b/drivers/staging/lustre/lustre/include/obd_class.h
@@ -477,7 +477,7 @@ static inline int obd_setup(struct obd_device *obd, struct 
lustre_cfg *cfg)
struct lu_context  session_ctx;
struct lu_env env;
 
-   lu_context_init(_ctx, LCT_SESSION);
+   lu_context_init(_ctx, LCT_SESSION | LCT_SERVER_SESSION);
session_ctx.lc_thread = NULL;
lu_context_enter(_ctx);
 
diff --git a/drivers/staging/lustre/lustre/ptlrpc/service.c 
b/drivers/staging/lustre/lustre/ptlrpc/service.c
index 1bbd1d3..fc2632f 100644
--- a/drivers/staging/lustre/lustre/ptlrpc/service.c
+++ b/drivers/staging/lustre/lustre/ptlrpc/service.c
@@ -1649,7 +1649,8 @@ ptlrpc_server_handle_request(struct ptlrpc_service_part 
*svcpt,
at_get(>scp_at_estimate));
}
 
-   rc = lu_context_init(>rq_session, LCT_SESSION | LCT_NOREF);
+   rc = lu_context_init(>rq_session, LCT_SERVER_SESSION |
+  LCT_NOREF);
if (rc) {
CERROR("Failure to initialize session: %d\n", rc);
goto out_req;
-- 
1.7.1



[PATCH 01/15] staging: lustre: llite: reset writeback index in ll_writepages

2016-04-27 Thread James Simmons
From: Jinshan Xiong 

Otherwise, after one round the writeback index will be beyond
the file size and ->writepages() turns into an empty operation.

Also, a safety guard is added to limit the wait time for grant to
at most 10 minutes (taking recovery into consideration) in the
osc_enter_cache() function. If the wait times out, EDQUOT is
returned so that the application starts a sync write.

Signed-off-by: Jinshan Xiong 
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-3416
Reviewed-on: http://review.whamcloud.com/6554
Reviewed-by: Bobi Jam 
Reviewed-by: Niu Yawei 
Reviewed-by: Oleg Drokin 
Signed-off-by: James Simmons 
---
 drivers/staging/lustre/lustre/llite/rw.c  |5 ++-
 drivers/staging/lustre/lustre/osc/osc_cache.c |   32 +---
 2 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/drivers/staging/lustre/lustre/llite/rw.c 
b/drivers/staging/lustre/lustre/llite/rw.c
index 4ddf8b3..3363977 100644
--- a/drivers/staging/lustre/lustre/llite/rw.c
+++ b/drivers/staging/lustre/lustre/llite/rw.c
@@ -1105,8 +1105,9 @@ int ll_writepages(struct address_space *mapping, struct 
writeback_control *wbc)
 
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) {
if (end == OBD_OBJECT_EOF)
-   end = i_size_read(inode);
-   mapping->writeback_index = (end >> PAGE_SHIFT) + 1;
+   mapping->writeback_index = 0;
+   else
+   mapping->writeback_index = (end >> PAGE_SHIFT) + 1;
}
return result;
 }
diff --git a/drivers/staging/lustre/lustre/osc/osc_cache.c 
b/drivers/staging/lustre/lustre/osc/osc_cache.c
index 43d8bcc..dccd309 100644
--- a/drivers/staging/lustre/lustre/osc/osc_cache.c
+++ b/drivers/staging/lustre/lustre/osc/osc_cache.c
@@ -1383,15 +1383,19 @@ static int osc_completion(const struct lu_env *env, 
struct osc_async_page *oap,
return 0;
 }
 
-#define OSC_DUMP_GRANT(cli, fmt, args...) do {   \
+#define OSC_DUMP_GRANT(lvl, cli, fmt, args...) do {  \
struct client_obd *__tmp = (cli); \
-   CDEBUG(D_CACHE, "%s: { dirty: %ld/%ld dirty_pages: %d/%d "\
-  "dropped: %ld avail: %ld, reserved: %ld, flight: %d } " fmt,   \
+   CDEBUG(lvl, "%s: grant { dirty: %ld/%ld dirty_pages: %d/%d "  \
+  "dropped: %ld avail: %ld, reserved: %ld, flight: %d } "\
+  "lru {in list: %d, left: %d, waiters: %d }" fmt,   \
   __tmp->cl_import->imp_obd->obd_name,   \
   __tmp->cl_dirty, __tmp->cl_dirty_max,  \
   atomic_read(_dirty_pages), obd_max_dirty_pages,\
   __tmp->cl_lost_grant, __tmp->cl_avail_grant,   \
-  __tmp->cl_reserved_grant, __tmp->cl_w_in_flight, ##args);  \
+  __tmp->cl_reserved_grant, __tmp->cl_w_in_flight,   \
+  atomic_read(&__tmp->cl_lru_in_list),   \
+  atomic_read(&__tmp->cl_lru_busy),  \
+  atomic_read(&__tmp->cl_lru_shrinkers), ##args);\
 } while (0)
 
 /* caller must hold loi_list_lock */
@@ -1531,7 +1535,7 @@ static int osc_enter_cache_try(struct client_obd *cli,
 {
int rc;
 
-   OSC_DUMP_GRANT(cli, "need:%d.\n", bytes);
+   OSC_DUMP_GRANT(D_CACHE, cli, "need:%d.\n", bytes);
 
rc = osc_reserve_grant(cli, bytes);
if (rc < 0)
@@ -1576,10 +1580,11 @@ static int osc_enter_cache(const struct lu_env *env, 
struct client_obd *cli,
struct osc_object *osc = oap->oap_obj;
struct lov_oinfo *loi = osc->oo_oinfo;
struct osc_cache_waiter ocw;
-   struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+   struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(600), NULL,
+ LWI_ON_SIGNAL_NOOP, NULL);
int rc = -EDQUOT;
 
-   OSC_DUMP_GRANT(cli, "need:%d.\n", bytes);
+   OSC_DUMP_GRANT(D_CACHE, cli, "need:%d.\n", bytes);
 
spin_lock(>cl_loi_list_lock);
 
@@ -1623,8 +1628,15 @@ static int osc_enter_cache(const struct lu_env *env, 
struct client_obd *cli,
 
spin_lock(>cl_loi_list_lock);
 
-   /* l_wait_event is interrupted by signal */
+   /* l_wait_event is interrupted by signal, or timed out */
if (rc < 0) {
+   if (rc == -ETIMEDOUT) {
+   OSC_DUMP_GRANT(D_ERROR, cli,
+  "try to reserve %d.\n", bytes);
+   osc_extent_tree_dump(D_ERROR, osc);
+   rc = 

[PATCH 04/15] staging: lustre: lmv: kernel crash due to misconfigured MDT

2016-04-27 Thread James Simmons
From: Dmitry Eremin 

There are a few places that access lmv->tgts[] without checking for NULL.
This can happen when the MDT is configured starting from index 1
instead of 0. For example:
mkfs.lustre --reformat --mgs --mdt --index=1 /dev/sdd1

Signed-off-by: Dmitry Eremin 
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-4098
Reviewed-on: http://review.whamcloud.com/7941
Reviewed-by: Andreas Dilger 
Reviewed-by: Alex Zhuravlev 
Reviewed-by: Oleg Drokin 
Signed-off-by: James Simmons 
---
 drivers/staging/lustre/lustre/lmv/lmv_obd.c |  151 --
 1 files changed, 93 insertions(+), 58 deletions(-)

diff --git a/drivers/staging/lustre/lustre/lmv/lmv_obd.c 
b/drivers/staging/lustre/lustre/lmv/lmv_obd.c
index 2f6457f..9e31f6b 100644
--- a/drivers/staging/lustre/lustre/lmv/lmv_obd.c
+++ b/drivers/staging/lustre/lustre/lmv/lmv_obd.c
@@ -132,8 +132,9 @@ static int lmv_set_mdc_active(struct lmv_obd *lmv, struct 
obd_uuid *uuid,
 static struct obd_uuid *lmv_get_uuid(struct obd_export *exp)
 {
struct lmv_obd *lmv = >exp_obd->u.lmv;
+   struct lmv_tgt_desc *tgt = lmv->tgts[0];
 
-   return obd_get_uuid(lmv->tgts[0]->ltd_exp);
+   return tgt ? obd_get_uuid(tgt->ltd_exp) : NULL;
 }
 
 static int lmv_notify(struct obd_device *obd, struct obd_device *watched,
@@ -249,7 +250,6 @@ static int lmv_connect(const struct lu_env *env,
 
 static void lmv_set_timeouts(struct obd_device *obd)
 {
-   struct lmv_tgt_desc   *tgt;
struct lmv_obd  *lmv;
int i;
 
@@ -261,8 +261,10 @@ static void lmv_set_timeouts(struct obd_device *obd)
return;
 
for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+   struct lmv_tgt_desc *tgt = lmv->tgts[i];
+
tgt = lmv->tgts[i];
-   if (!tgt || !tgt->ltd_exp || tgt->ltd_active == 0)
+   if (!tgt || !tgt->ltd_exp || !tgt->ltd_active)
continue;
 
obd_set_info_async(NULL, tgt->ltd_exp, sizeof(KEY_INTERMDS),
@@ -302,13 +304,14 @@ static int lmv_init_ea_size(struct obd_export *exp, int 
easize,
return 0;
 
for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
-   if (!lmv->tgts[i] || !lmv->tgts[i]->ltd_exp ||
-   lmv->tgts[i]->ltd_active == 0) {
+   struct lmv_tgt_desc *tgt = lmv->tgts[i];
+
+   if (!tgt || !tgt->ltd_exp || !tgt->ltd_active) {
CWARN("%s: NULL export for %d\n", obd->obd_name, i);
continue;
}
 
-   rc = md_init_ea_size(lmv->tgts[i]->ltd_exp, easize, def_easize,
+   rc = md_init_ea_size(tgt->ltd_exp, easize, def_easize,
 cookiesize, def_cookiesize);
if (rc) {
CERROR("%s: obd_init_ea_size() failed on MDT target %d: 
rc = %d\n",
@@ -534,6 +537,15 @@ int lmv_check_connect(struct obd_device *obd)
return -EINVAL;
}
 
+   LASSERT(lmv->tgts);
+
+   if (!lmv->tgts[0]) {
+   mutex_unlock(>lmv_init_mutex);
+   CERROR("%s: no target configured for index 0.\n",
+  obd->obd_name);
+   return -EINVAL;
+   }
+
CDEBUG(D_CONFIG, "Time to connect %s to %s\n",
   lmv->cluuid.uuid, obd->obd_name);
 
@@ -796,6 +808,11 @@ static int lmv_hsm_ct_unregister(struct lmv_obd *lmv, 
unsigned int cmd, int len,
 
/* unregister request (call from llapi_hsm_copytool_fini) */
for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+   struct lmv_tgt_desc *tgt = lmv->tgts[i];
+
+   if (!tgt || !tgt->ltd_exp)
+   continue;
+
/* best effort: try to clean as much as possible
 * (continue on error)
 */
@@ -825,20 +842,28 @@ static int lmv_hsm_ct_register(struct lmv_obd *lmv, 
unsigned int cmd, int len,
 * except if it because of inactive target.
 */
for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
-   err = obd_iocontrol(cmd, lmv->tgts[i]->ltd_exp, len, lk, uarg);
+   struct lmv_tgt_desc *tgt = lmv->tgts[i];
+
+   if (!tgt || !tgt->ltd_exp)
+   continue;
+
+   err = obd_iocontrol(cmd, tgt->ltd_exp, len, lk, uarg);
if (err) {
-   if (lmv->tgts[i]->ltd_active) {
+   if (tgt->ltd_active) {
/* permanent error */
CERROR("error: iocontrol MDC %s on MDTidx %d 
cmd %x: err = %d\n",
-  lmv->tgts[i]->ltd_uuid.uuid,
-  i, cmd, err);
+  tgt->ltd_uuid.uuid, 

[PATCH 04/15] staging: lustre: lmv: kernel crash due to misconfigured MDT

2016-04-27 Thread James Simmons
From: Dmitry Eremin 

There are a few places that access lmv->tgts[] without checking for NULL.
This can happen when the MDT is configured starting from index 1
instead of 0. For example:
mkfs.lustre --reformat --mgs --mdt --index=1 /dev/sdd1

Signed-off-by: Dmitry Eremin 
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-4098
Reviewed-on: http://review.whamcloud.com/7941
Reviewed-by: Andreas Dilger 
Reviewed-by: Alex Zhuravlev 
Reviewed-by: Oleg Drokin 
Signed-off-by: James Simmons 
---
 drivers/staging/lustre/lustre/lmv/lmv_obd.c |  151 --
 1 files changed, 93 insertions(+), 58 deletions(-)

diff --git a/drivers/staging/lustre/lustre/lmv/lmv_obd.c 
b/drivers/staging/lustre/lustre/lmv/lmv_obd.c
index 2f6457f..9e31f6b 100644
--- a/drivers/staging/lustre/lustre/lmv/lmv_obd.c
+++ b/drivers/staging/lustre/lustre/lmv/lmv_obd.c
@@ -132,8 +132,9 @@ static int lmv_set_mdc_active(struct lmv_obd *lmv, struct 
obd_uuid *uuid,
 static struct obd_uuid *lmv_get_uuid(struct obd_export *exp)
 {
struct lmv_obd *lmv = >exp_obd->u.lmv;
+   struct lmv_tgt_desc *tgt = lmv->tgts[0];
 
-   return obd_get_uuid(lmv->tgts[0]->ltd_exp);
+   return tgt ? obd_get_uuid(tgt->ltd_exp) : NULL;
 }
 
 static int lmv_notify(struct obd_device *obd, struct obd_device *watched,
@@ -249,7 +250,6 @@ static int lmv_connect(const struct lu_env *env,
 
 static void lmv_set_timeouts(struct obd_device *obd)
 {
-   struct lmv_tgt_desc   *tgt;
struct lmv_obd  *lmv;
int i;
 
@@ -261,8 +261,10 @@ static void lmv_set_timeouts(struct obd_device *obd)
return;
 
for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+   struct lmv_tgt_desc *tgt = lmv->tgts[i];
+
tgt = lmv->tgts[i];
-   if (!tgt || !tgt->ltd_exp || tgt->ltd_active == 0)
+   if (!tgt || !tgt->ltd_exp || !tgt->ltd_active)
continue;
 
obd_set_info_async(NULL, tgt->ltd_exp, sizeof(KEY_INTERMDS),
@@ -302,13 +304,14 @@ static int lmv_init_ea_size(struct obd_export *exp, int 
easize,
return 0;
 
for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
-   if (!lmv->tgts[i] || !lmv->tgts[i]->ltd_exp ||
-   lmv->tgts[i]->ltd_active == 0) {
+   struct lmv_tgt_desc *tgt = lmv->tgts[i];
+
+   if (!tgt || !tgt->ltd_exp || !tgt->ltd_active) {
CWARN("%s: NULL export for %d\n", obd->obd_name, i);
continue;
}
 
-   rc = md_init_ea_size(lmv->tgts[i]->ltd_exp, easize, def_easize,
+   rc = md_init_ea_size(tgt->ltd_exp, easize, def_easize,
 cookiesize, def_cookiesize);
if (rc) {
CERROR("%s: obd_init_ea_size() failed on MDT target %d: 
rc = %d\n",
@@ -534,6 +537,15 @@ int lmv_check_connect(struct obd_device *obd)
return -EINVAL;
}
 
+   LASSERT(lmv->tgts);
+
+   if (!lmv->tgts[0]) {
+   mutex_unlock(>lmv_init_mutex);
+   CERROR("%s: no target configured for index 0.\n",
+  obd->obd_name);
+   return -EINVAL;
+   }
+
CDEBUG(D_CONFIG, "Time to connect %s to %s\n",
   lmv->cluuid.uuid, obd->obd_name);
 
@@ -796,6 +808,11 @@ static int lmv_hsm_ct_unregister(struct lmv_obd *lmv, 
unsigned int cmd, int len,
 
/* unregister request (call from llapi_hsm_copytool_fini) */
for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+   struct lmv_tgt_desc *tgt = lmv->tgts[i];
+
+   if (!tgt || !tgt->ltd_exp)
+   continue;
+
/* best effort: try to clean as much as possible
 * (continue on error)
 */
@@ -825,20 +842,28 @@ static int lmv_hsm_ct_register(struct lmv_obd *lmv, 
unsigned int cmd, int len,
 * except if it because of inactive target.
 */
for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
-   err = obd_iocontrol(cmd, lmv->tgts[i]->ltd_exp, len, lk, uarg);
+   struct lmv_tgt_desc *tgt = lmv->tgts[i];
+
+   if (!tgt || !tgt->ltd_exp)
+   continue;
+
+   err = obd_iocontrol(cmd, tgt->ltd_exp, len, lk, uarg);
if (err) {
-   if (lmv->tgts[i]->ltd_active) {
+   if (tgt->ltd_active) {
/* permanent error */
CERROR("error: iocontrol MDC %s on MDTidx %d 
cmd %x: err = %d\n",
-  lmv->tgts[i]->ltd_uuid.uuid,
-  i, cmd, err);
+  tgt->ltd_uuid.uuid, i, cmd, err);
rc = err;
lk->lk_flags |= LK_FLG_STOP;
/* 

[PATCH 01/15] staging: lustre: llite: reset writeback index in ll_writepages

2016-04-27 Thread James Simmons
From: Jinshan Xiong 

Otherwise, after one round the writeback index will be beyond
the file size and ->writepages() turns into an empty operation.

Also, a safety guard is added to limit the wait time for grant to
at most 10 minutes (taking recovery into consideration) in the
osc_enter_cache() function. If the wait times out, EDQUOT is
returned so that the application starts a sync write.

Signed-off-by: Jinshan Xiong 
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-3416
Reviewed-on: http://review.whamcloud.com/6554
Reviewed-by: Bobi Jam 
Reviewed-by: Niu Yawei 
Reviewed-by: Oleg Drokin 
Signed-off-by: James Simmons 
---
 drivers/staging/lustre/lustre/llite/rw.c  |5 ++-
 drivers/staging/lustre/lustre/osc/osc_cache.c |   32 +---
 2 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/drivers/staging/lustre/lustre/llite/rw.c 
b/drivers/staging/lustre/lustre/llite/rw.c
index 4ddf8b3..3363977 100644
--- a/drivers/staging/lustre/lustre/llite/rw.c
+++ b/drivers/staging/lustre/lustre/llite/rw.c
@@ -1105,8 +1105,9 @@ int ll_writepages(struct address_space *mapping, struct 
writeback_control *wbc)
 
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) {
if (end == OBD_OBJECT_EOF)
-   end = i_size_read(inode);
-   mapping->writeback_index = (end >> PAGE_SHIFT) + 1;
+   mapping->writeback_index = 0;
+   else
+   mapping->writeback_index = (end >> PAGE_SHIFT) + 1;
}
return result;
 }
diff --git a/drivers/staging/lustre/lustre/osc/osc_cache.c 
b/drivers/staging/lustre/lustre/osc/osc_cache.c
index 43d8bcc..dccd309 100644
--- a/drivers/staging/lustre/lustre/osc/osc_cache.c
+++ b/drivers/staging/lustre/lustre/osc/osc_cache.c
@@ -1383,15 +1383,19 @@ static int osc_completion(const struct lu_env *env, 
struct osc_async_page *oap,
return 0;
 }
 
-#define OSC_DUMP_GRANT(cli, fmt, args...) do {   \
+#define OSC_DUMP_GRANT(lvl, cli, fmt, args...) do {  \
struct client_obd *__tmp = (cli); \
-   CDEBUG(D_CACHE, "%s: { dirty: %ld/%ld dirty_pages: %d/%d "\
-  "dropped: %ld avail: %ld, reserved: %ld, flight: %d } " fmt,   \
+   CDEBUG(lvl, "%s: grant { dirty: %ld/%ld dirty_pages: %d/%d "  \
+  "dropped: %ld avail: %ld, reserved: %ld, flight: %d } "\
+  "lru {in list: %d, left: %d, waiters: %d }" fmt,   \
   __tmp->cl_import->imp_obd->obd_name,   \
   __tmp->cl_dirty, __tmp->cl_dirty_max,  \
   atomic_read(_dirty_pages), obd_max_dirty_pages,\
   __tmp->cl_lost_grant, __tmp->cl_avail_grant,   \
-  __tmp->cl_reserved_grant, __tmp->cl_w_in_flight, ##args);  \
+  __tmp->cl_reserved_grant, __tmp->cl_w_in_flight,   \
+  atomic_read(&__tmp->cl_lru_in_list),   \
+  atomic_read(&__tmp->cl_lru_busy),  \
+  atomic_read(&__tmp->cl_lru_shrinkers), ##args);\
 } while (0)
 
 /* caller must hold loi_list_lock */
@@ -1531,7 +1535,7 @@ static int osc_enter_cache_try(struct client_obd *cli,
 {
int rc;
 
-   OSC_DUMP_GRANT(cli, "need:%d.\n", bytes);
+   OSC_DUMP_GRANT(D_CACHE, cli, "need:%d.\n", bytes);
 
rc = osc_reserve_grant(cli, bytes);
if (rc < 0)
@@ -1576,10 +1580,11 @@ static int osc_enter_cache(const struct lu_env *env, 
struct client_obd *cli,
struct osc_object *osc = oap->oap_obj;
struct lov_oinfo *loi = osc->oo_oinfo;
struct osc_cache_waiter ocw;
-   struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+   struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(600), NULL,
+ LWI_ON_SIGNAL_NOOP, NULL);
int rc = -EDQUOT;
 
-   OSC_DUMP_GRANT(cli, "need:%d.\n", bytes);
+   OSC_DUMP_GRANT(D_CACHE, cli, "need:%d.\n", bytes);
 
spin_lock(>cl_loi_list_lock);
 
@@ -1623,8 +1628,15 @@ static int osc_enter_cache(const struct lu_env *env, 
struct client_obd *cli,
 
spin_lock(>cl_loi_list_lock);
 
-   /* l_wait_event is interrupted by signal */
+   /* l_wait_event is interrupted by signal, or timed out */
if (rc < 0) {
+   if (rc == -ETIMEDOUT) {
+   OSC_DUMP_GRANT(D_ERROR, cli,
+  "try to reserve %d.\n", bytes);
+   osc_extent_tree_dump(D_ERROR, osc);
+   rc = -EDQUOT;
+   }
+
list_del_init(_entry);
goto out;
}

[PATCH 11/15] staging: lustre: mgc: fix 'error handling' issues

2016-04-27 Thread James Simmons
From: Sebastien Buisson 

Fix 'error handling' issues found by Coverity version 6.6.1:
Unchecked return value (CHECKED_RETURN)
Calling function without checking return value.
Argument cannot be negative (NEGATIVE_RETURNS)
Negative value used as argument to a function expecting a
positive value.

Signed-off-by: Sebastien Buisson 
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-4055
Reviewed-on: http://review.whamcloud.com/7842
Reviewed-by: Dmitry Eremin 
Reviewed-by: James Nunez 
Reviewed-by: Oleg Drokin 
Signed-off-by: James Simmons 
---
 drivers/staging/lustre/lustre/mgc/mgc_request.c |6 +-
 1 files changed, 5 insertions(+), 1 deletions(-)

diff --git a/drivers/staging/lustre/lustre/mgc/mgc_request.c 
b/drivers/staging/lustre/lustre/mgc/mgc_request.c
index 933f6f6..2311a43 100644
--- a/drivers/staging/lustre/lustre/mgc/mgc_request.c
+++ b/drivers/staging/lustre/lustre/mgc/mgc_request.c
@@ -502,8 +502,12 @@ static void do_requeue(struct config_llog_data *cld)
 */
down_read(>cld_mgcexp->exp_obd->u.cli.cl_sem);
if (cld->cld_mgcexp->exp_obd->u.cli.cl_conn_count != 0) {
+   int rc;
+
CDEBUG(D_MGC, "updating log %s\n", cld->cld_logname);
-   mgc_process_log(cld->cld_mgcexp->exp_obd, cld);
+   rc = mgc_process_log(cld->cld_mgcexp->exp_obd, cld);
+   if (rc && rc != -ENOENT)
+   CERROR("failed processing log: %d\n", rc);
} else {
CDEBUG(D_MGC, "disconnecting, won't update log %s\n",
   cld->cld_logname);
-- 
1.7.1



mmotm 2016-04-27-15-21 uploaded

2016-04-27 Thread akpm
The mm-of-the-moment snapshot 2016-04-27-15-21 has been uploaded to

   http://www.ozlabs.org/~akpm/mmotm/

mmotm-readme.txt says

README for mm-of-the-moment:

http://www.ozlabs.org/~akpm/mmotm/

This is a snapshot of my -mm patch queue.  Uploaded at random hopefully
more than once a week.

You will need quilt to apply these patches to the latest Linus release (4.x
or 4.x-rcY).  The series file is in broken-out.tar.gz and is duplicated in
http://ozlabs.org/~akpm/mmotm/series

The file broken-out.tar.gz contains two datestamp files: .DATE and
.DATE-yyyy-mm-dd-hh-mm-ss.  Both contain the string yyyy-mm-dd-hh-mm-ss,
followed by the base kernel version against which this patch series is to
be applied.

This tree is partially included in linux-next.  To see which patches are
included in linux-next, consult the `series' file.  Only the patches
within the #NEXT_PATCHES_START/#NEXT_PATCHES_END markers are included in
linux-next.

A git tree which contains the memory management portion of this tree is
maintained at git://git.kernel.org/pub/scm/linux/kernel/git/mhocko/mm.git
by Michal Hocko.  It contains the patches which are between the
"#NEXT_PATCHES_START mm" and "#NEXT_PATCHES_END" markers, from the series
file, http://www.ozlabs.org/~akpm/mmotm/series.


A full copy of the full kernel tree with the linux-next and mmotm patches
already applied is available through git within an hour of the mmotm
release.  Individual mmotm releases are tagged.  The master branch always
points to the latest release, so it's constantly rebasing.

http://git.cmpxchg.org/cgit.cgi/linux-mmotm.git/

To develop on top of mmotm git:

  $ git remote add mmotm 
git://git.kernel.org/pub/scm/linux/kernel/git/mhocko/mm.git
  $ git remote update mmotm
  $ git checkout -b topic mmotm/master
  <make changes, commit>
  $ git send-email mmotm/master.. [...]

To rebase a branch with older patches to a new mmotm release:

  $ git remote update mmotm
  $ git rebase --onto mmotm/master <topic base> topic




The directory http://www.ozlabs.org/~akpm/mmots/ (mm-of-the-second)
contains daily snapshots of the -mm tree.  It is updated more frequently
than mmotm, and is untested.

A git copy of this tree is available at

http://git.cmpxchg.org/cgit.cgi/linux-mmots.git/

and use of this tree is similar to
http://git.cmpxchg.org/cgit.cgi/linux-mmotm.git/, described above.


This mmotm tree contains the following patches against 4.6-rc5:
(patches marked "*" will be included in linux-next)

  origin.patch
* ksm-introduce-ksm_max_page_sharing-per-page-deduplication-limit.patch
  i-need-old-gcc.patch
  arch-alpha-kernel-systblss-remove-debug-check.patch
* kexec-update-vmcoreinfo-for-compound_order-dtor.patch
* kexec-export-offsetpagecompound_head-to-find-out-compound-tail-page.patch
* mm-exclude-hugetlb-pages-from-thp-page_mapped-logic.patch
* thp-keep-huge-zero-page-pinned-until-tlb-flush.patch
* mailmap-fix-krzysztof-kozlowskis-misspelled-name.patch
* mm-huge_memory-replace-vm_no_thp-vm_bug_on-with-actual-vma-check.patch
* numa-fix-proc-pid-numa_maps-for-thp.patch
* mm-vmscan-reclaim-highmem-zone-if-buffer_heads-is-over-limit.patch
* mm-call-swap_slot_free_notify-with-holding-page-lock.patch
* mm-hwpoison-fix-wrong-num_poisoned_pages-account.patch
* mailmap-add-frank-rowand.patch
* mm-wake-kcompactd-before-kswapds-short-sleep.patch
* kcov-dont-trace-the-code-coverage-code.patch
* kcov-dont-profile-branches-in-kcov.patch
* update-email-address.patch
* ocfs2-dlm-return-zero-if-deref_done-message-is-successfully-handled.patch
* mm-memory-failure-fix-race-with-compound-page-split-merge.patch
* rapidio-fix-potential-null-pointer-dereference.patch
* lib-stackdepotc-allow-the-stack-trace-hash-to-be-zero.patch
* mm-update-the-document-of-numa_zonelist_order.patch
* kprobes-add-the-tls-argument-for-j_do_fork.patch
* mm-thp-correct-split_huge_pages-file-permission.patch
* mm-memcontrol-let-v2-cgroups-follow-changes-in-system-swappiness.patch
* rapidio-mport_cdev-fix-uapi-type-definitions.patch
* huge-pagecache-mmap_sem-is-unlocked-when-truncation-splits-pmd.patch
* mm-update-min_free_kbytes-from-khugepaged-after-core-initialization.patch
* mm-cma-prevent-nr_isolated_-counters-from-going-negative.patch
* maintainers-fix-rajendra-nayaks-address.patch
* mm-thp-kvm-fix-memory-corruption-in-kvm-with-thp-enabled.patch
* dax-add-dax_get_unmapped_area-for-pmd-mappings.patch
* ext2-4-xfs-blk-call-dax_get_unmapped_area-for-dax-pmd-mappings.patch
* arm-arch-arm-include-asm-pageh-needs-personalityh.patch
* fsnotify-avoid-spurious-emfile-errors-from-inotify_init.patch
* fsnotify-avoid-spurious-emfile-errors-from-inotify_init-checkpatch-fixes.patch
* scripts-decode_stacktracesh-handle-symbols-in-modules.patch
* scripts-spellingtxt-add-fimware-misspelling.patch
* debugobjects-make-fixup-functions-return-bool-instead-of-int.patch
* debugobjects-correct-the-usage-of-fixup-call-results.patch
* workqueue-update-debugobjects-fixup-callbacks-return-type.patch
* 

[PATCH 09/15] staging: lustre: llite: Replace printing of i_ino with ll_inode2fid()

2016-04-27 Thread James Simmons
From: James Nunez 

The printing of i_ino/i_generation in llite messages is not nearly so
useful as printing the full inode FID, since i_ino is a "compressed"
version of the FID and there may be duplicate values for i_ino in some
cases (especially if running on a 32-bit client).

All instances of printing i_ino/i_generation are replaced with
the FID using ll_inode2fid(). All instances, except for one, of
printing just i_ino were replaced by printing the FID. In all
CERROR lines touched by the i_ino replacements, the device name
or fsname was added at the beginning of the message if it was not
already present.

Signed-off-by: James Nunez 
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-3491
Reviewed-on: http://review.whamcloud.com/6848
Reviewed-by: Andreas Dilger 
Reviewed-by: John L. Hammond 
Signed-off-by: James Simmons 
---
 drivers/staging/lustre/lustre/llite/dcache.c   |8 +-
 drivers/staging/lustre/lustre/llite/dir.c  |   23 +++---
 drivers/staging/lustre/lustre/llite/file.c |   70 +-
 drivers/staging/lustre/lustre/llite/llite_close.c  |   40 ++-
 .../staging/lustre/lustre/llite/llite_internal.h   |   10 +--
 drivers/staging/lustre/lustre/llite/llite_lib.c|   40 ++-
 drivers/staging/lustre/lustre/llite/llite_mmap.c   |6 +-
 drivers/staging/lustre/lustre/llite/llite_nfs.c|   14 +++--
 drivers/staging/lustre/lustre/llite/namei.c|   76 +---
 drivers/staging/lustre/lustre/llite/rw26.c |5 +-
 drivers/staging/lustre/lustre/llite/statahead.c|   17 ++---
 drivers/staging/lustre/lustre/llite/symlink.c  |   10 ++-
 drivers/staging/lustre/lustre/llite/vvp_dev.c  |5 +-
 drivers/staging/lustre/lustre/llite/xattr.c|   20 +++---
 14 files changed, 174 insertions(+), 170 deletions(-)

diff --git a/drivers/staging/lustre/lustre/llite/dcache.c 
b/drivers/staging/lustre/lustre/llite/dcache.c
index 5596b13..1b6f82a 100644
--- a/drivers/staging/lustre/lustre/llite/dcache.c
+++ b/drivers/staging/lustre/lustre/llite/dcache.c
@@ -250,8 +250,8 @@ void ll_invalidate_aliases(struct inode *inode)
 {
struct dentry *dentry;
 
-   CDEBUG(D_INODE, "marking dentries for ino %lu/%u(%p) invalid\n",
-  inode->i_ino, inode->i_generation, inode);
+   CDEBUG(D_INODE, "marking dentries for ino "DFID"(%p) invalid\n",
+  PFID(ll_inode2fid(inode)), inode);
 
ll_lock_dcache(inode);
hlist_for_each_entry(dentry, >i_dentry, d_u.d_alias) {
@@ -286,8 +286,8 @@ void ll_lookup_finish_locks(struct lookup_intent *it, 
struct inode *inode)
if (it->d.lustre.it_lock_mode && inode) {
struct ll_sb_info *sbi = ll_i2sbi(inode);
 
-   CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)\n",
-  inode, inode->i_ino, inode->i_generation);
+   CDEBUG(D_DLMTRACE, "setting l_data to inode "DFID"(%p)\n",
+  PFID(ll_inode2fid(inode)), inode);
ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL);
}
 
diff --git a/drivers/staging/lustre/lustre/llite/dir.c 
b/drivers/staging/lustre/lustre/llite/dir.c
index b457c28..9463da2 100644
--- a/drivers/staging/lustre/lustre/llite/dir.c
+++ b/drivers/staging/lustre/lustre/llite/dir.c
@@ -158,8 +158,8 @@ static int ll_dir_filler(void *_hash, struct page *page0)
int i;
int rc;
 
-   CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) hash %llu\n",
-  inode->i_ino, inode->i_generation, inode, hash);
+   CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p) hash %llu\n",
+  PFID(ll_inode2fid(inode)), inode, hash);
 
LASSERT(max_pages > 0 && max_pages <= MD_MAX_BRW_PAGES);
 
@@ -372,8 +372,8 @@ struct page *ll_get_dir_page(struct inode *dir, __u64 hash,
return ERR_PTR(rc);
}
 
-   CDEBUG(D_INODE, "setting lr_lvb_inode to inode %p (%lu/%u)\n",
-  dir, dir->i_ino, dir->i_generation);
+   CDEBUG(D_INODE, "setting lr_lvb_inode to inode "DFID"(%p)\n",
+  PFID(ll_inode2fid(dir)), dir);
md_set_lock_data(ll_i2sbi(dir)->ll_md_exp,
 _lock_handle, dir, NULL);
} else {
@@ -616,9 +616,9 @@ static int ll_readdir(struct file *filp, struct dir_context 
*ctx)
int api32   = ll_need_32bit_api(sbi);
int rc;
 
-   CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) pos %lu/%llu 32bit_api 
%d\n",
-  inode->i_ino, inode->i_generation,
-  inode, (unsigned long)pos, i_size_read(inode), api32);
+   CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p) pos %lu/%llu 32bit_api 
%d\n",
+  PFID(ll_inode2fid(inode)), inode, (unsigned long)pos,
+  i_size_read(inode), 

[PATCH 11/15] staging: lustre: mgc: fix 'error handling' issues

2016-04-27 Thread James Simmons
From: Sebastien Buisson 

Fix 'error handling' issues found by Coverity version 6.6.1:
Unchecked return value (CHECKED_RETURN)
Calling function without checking return value.
Argument cannot be negative (NEGATIVE_RETURNS)
Negative value used as argument to a function expecting a
positive value.

Signed-off-by: Sebastien Buisson 
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-4055
Reviewed-on: http://review.whamcloud.com/7842
Reviewed-by: Dmitry Eremin 
Reviewed-by: James Nunez 
Reviewed-by: Oleg Drokin 
Signed-off-by: James Simmons 
---
 drivers/staging/lustre/lustre/mgc/mgc_request.c |6 +-
 1 files changed, 5 insertions(+), 1 deletions(-)

diff --git a/drivers/staging/lustre/lustre/mgc/mgc_request.c 
b/drivers/staging/lustre/lustre/mgc/mgc_request.c
index 933f6f6..2311a43 100644
--- a/drivers/staging/lustre/lustre/mgc/mgc_request.c
+++ b/drivers/staging/lustre/lustre/mgc/mgc_request.c
@@ -502,8 +502,12 @@ static void do_requeue(struct config_llog_data *cld)
 */
down_read(>cld_mgcexp->exp_obd->u.cli.cl_sem);
if (cld->cld_mgcexp->exp_obd->u.cli.cl_conn_count != 0) {
+   int rc;
+
CDEBUG(D_MGC, "updating log %s\n", cld->cld_logname);
-   mgc_process_log(cld->cld_mgcexp->exp_obd, cld);
+   rc = mgc_process_log(cld->cld_mgcexp->exp_obd, cld);
+   if (rc && rc != -ENOENT)
+   CERROR("failed processing log: %d\n", rc);
} else {
CDEBUG(D_MGC, "disconnecting, won't update log %s\n",
   cld->cld_logname);
-- 
1.7.1



mmotm 2016-04-27-15-21 uploaded

2016-04-27 Thread akpm
The mm-of-the-moment snapshot 2016-04-27-15-21 has been uploaded to

   http://www.ozlabs.org/~akpm/mmotm/

mmotm-readme.txt says

README for mm-of-the-moment:

http://www.ozlabs.org/~akpm/mmotm/

This is a snapshot of my -mm patch queue.  Uploaded at random hopefully
more than once a week.

You will need quilt to apply these patches to the latest Linus release (4.x
or 4.x-rcY).  The series file is in broken-out.tar.gz and is duplicated in
http://ozlabs.org/~akpm/mmotm/series

The file broken-out.tar.gz contains two datestamp files: .DATE and
.DATE-yyyy-mm-dd-hh-mm-ss.  Both contain the string yyyy-mm-dd-hh-mm-ss,
followed by the base kernel version against which this patch series is to
be applied.

This tree is partially included in linux-next.  To see which patches are
included in linux-next, consult the `series' file.  Only the patches
within the #NEXT_PATCHES_START/#NEXT_PATCHES_END markers are included in
linux-next.

A git tree which contains the memory management portion of this tree is
maintained at git://git.kernel.org/pub/scm/linux/kernel/git/mhocko/mm.git
by Michal Hocko.  It contains the patches which are between the
"#NEXT_PATCHES_START mm" and "#NEXT_PATCHES_END" markers, from the series
file, http://www.ozlabs.org/~akpm/mmotm/series.


A full copy of the full kernel tree with the linux-next and mmotm patches
already applied is available through git within an hour of the mmotm
release.  Individual mmotm releases are tagged.  The master branch always
points to the latest release, so it's constantly rebasing.

http://git.cmpxchg.org/cgit.cgi/linux-mmotm.git/

To develop on top of mmotm git:

  $ git remote add mmotm 
git://git.kernel.org/pub/scm/linux/kernel/git/mhocko/mm.git
  $ git remote update mmotm
  $ git checkout -b topic mmotm/master
  <make changes, commit>
  $ git send-email mmotm/master.. [...]

To rebase a branch with older patches to a new mmotm release:

  $ git remote update mmotm
  $ git rebase --onto mmotm/master <topic base> topic




The directory http://www.ozlabs.org/~akpm/mmots/ (mm-of-the-second)
contains daily snapshots of the -mm tree.  It is updated more frequently
than mmotm, and is untested.

A git copy of this tree is available at

http://git.cmpxchg.org/cgit.cgi/linux-mmots.git/

and use of this tree is similar to
http://git.cmpxchg.org/cgit.cgi/linux-mmotm.git/, described above.


This mmotm tree contains the following patches against 4.6-rc5:
(patches marked "*" will be included in linux-next)

  origin.patch
* ksm-introduce-ksm_max_page_sharing-per-page-deduplication-limit.patch
  i-need-old-gcc.patch
  arch-alpha-kernel-systblss-remove-debug-check.patch
* kexec-update-vmcoreinfo-for-compound_order-dtor.patch
* kexec-export-offsetpagecompound_head-to-find-out-compound-tail-page.patch
* mm-exclude-hugetlb-pages-from-thp-page_mapped-logic.patch
* thp-keep-huge-zero-page-pinned-until-tlb-flush.patch
* mailmap-fix-krzysztof-kozlowskis-misspelled-name.patch
* mm-huge_memory-replace-vm_no_thp-vm_bug_on-with-actual-vma-check.patch
* numa-fix-proc-pid-numa_maps-for-thp.patch
* mm-vmscan-reclaim-highmem-zone-if-buffer_heads-is-over-limit.patch
* mm-call-swap_slot_free_notify-with-holding-page-lock.patch
* mm-hwpoison-fix-wrong-num_poisoned_pages-account.patch
* mailmap-add-frank-rowand.patch
* mm-wake-kcompactd-before-kswapds-short-sleep.patch
* kcov-dont-trace-the-code-coverage-code.patch
* kcov-dont-profile-branches-in-kcov.patch
* update-email-address.patch
* ocfs2-dlm-return-zero-if-deref_done-message-is-successfully-handled.patch
* mm-memory-failure-fix-race-with-compound-page-split-merge.patch
* rapidio-fix-potential-null-pointer-dereference.patch
* lib-stackdepotc-allow-the-stack-trace-hash-to-be-zero.patch
* mm-update-the-document-of-numa_zonelist_order.patch
* kprobes-add-the-tls-argument-for-j_do_fork.patch
* mm-thp-correct-split_huge_pages-file-permission.patch
* mm-memcontrol-let-v2-cgroups-follow-changes-in-system-swappiness.patch
* rapidio-mport_cdev-fix-uapi-type-definitions.patch
* huge-pagecache-mmap_sem-is-unlocked-when-truncation-splits-pmd.patch
* mm-update-min_free_kbytes-from-khugepaged-after-core-initialization.patch
* mm-cma-prevent-nr_isolated_-counters-from-going-negative.patch
* maintainers-fix-rajendra-nayaks-address.patch
* mm-thp-kvm-fix-memory-corruption-in-kvm-with-thp-enabled.patch
* dax-add-dax_get_unmapped_area-for-pmd-mappings.patch
* ext2-4-xfs-blk-call-dax_get_unmapped_area-for-dax-pmd-mappings.patch
* arm-arch-arm-include-asm-pageh-needs-personalityh.patch
* fsnotify-avoid-spurious-emfile-errors-from-inotify_init.patch
* fsnotify-avoid-spurious-emfile-errors-from-inotify_init-checkpatch-fixes.patch
* scripts-decode_stacktracesh-handle-symbols-in-modules.patch
* scripts-spellingtxt-add-fimware-misspelling.patch
* debugobjects-make-fixup-functions-return-bool-instead-of-int.patch
* debugobjects-correct-the-usage-of-fixup-call-results.patch
* workqueue-update-debugobjects-fixup-callbacks-return-type.patch
* 

[PATCH 09/15] staging: lustre: llite: Replace printing of i_ino with ll_inode2fid()

2016-04-27 Thread James Simmons
From: James Nunez 

The printing of i_ino/i_generation in llite messages is not nearly so
useful as printing the full inode FID, since i_ino is a "compressed"
version of the FID and there may be duplicate values for i_ino in some
cases (especially if running on a 32-bit client).

All instances of printing i_ino/i_generation are replaced with
the FID using ll_inode2fid(). All instances, except for one, of
printing just i_ino were replaced by printing the FID. In all
CERROR lines touched by the i_ino replacements, the device name
or fsname was added at the beginning of the message if it did not
already exist.

Signed-off-by: James Nunez 
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-3491
Reviewed-on: http://review.whamcloud.com/6848
Reviewed-by: Andreas Dilger 
Reviewed-by: John L. Hammond 
Signed-off-by: James Simmons 
---
 drivers/staging/lustre/lustre/llite/dcache.c   |8 +-
 drivers/staging/lustre/lustre/llite/dir.c  |   23 +++---
 drivers/staging/lustre/lustre/llite/file.c |   70 +-
 drivers/staging/lustre/lustre/llite/llite_close.c  |   40 ++-
 .../staging/lustre/lustre/llite/llite_internal.h   |   10 +--
 drivers/staging/lustre/lustre/llite/llite_lib.c|   40 ++-
 drivers/staging/lustre/lustre/llite/llite_mmap.c   |6 +-
 drivers/staging/lustre/lustre/llite/llite_nfs.c|   14 +++--
 drivers/staging/lustre/lustre/llite/namei.c|   76 +---
 drivers/staging/lustre/lustre/llite/rw26.c |5 +-
 drivers/staging/lustre/lustre/llite/statahead.c|   17 ++---
 drivers/staging/lustre/lustre/llite/symlink.c  |   10 ++-
 drivers/staging/lustre/lustre/llite/vvp_dev.c  |5 +-
 drivers/staging/lustre/lustre/llite/xattr.c|   20 +++---
 14 files changed, 174 insertions(+), 170 deletions(-)

diff --git a/drivers/staging/lustre/lustre/llite/dcache.c 
b/drivers/staging/lustre/lustre/llite/dcache.c
index 5596b13..1b6f82a 100644
--- a/drivers/staging/lustre/lustre/llite/dcache.c
+++ b/drivers/staging/lustre/lustre/llite/dcache.c
@@ -250,8 +250,8 @@ void ll_invalidate_aliases(struct inode *inode)
 {
struct dentry *dentry;
 
-   CDEBUG(D_INODE, "marking dentries for ino %lu/%u(%p) invalid\n",
-  inode->i_ino, inode->i_generation, inode);
+   CDEBUG(D_INODE, "marking dentries for ino "DFID"(%p) invalid\n",
+  PFID(ll_inode2fid(inode)), inode);
 
ll_lock_dcache(inode);
hlist_for_each_entry(dentry, >i_dentry, d_u.d_alias) {
@@ -286,8 +286,8 @@ void ll_lookup_finish_locks(struct lookup_intent *it, 
struct inode *inode)
if (it->d.lustre.it_lock_mode && inode) {
struct ll_sb_info *sbi = ll_i2sbi(inode);
 
-   CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)\n",
-  inode, inode->i_ino, inode->i_generation);
+   CDEBUG(D_DLMTRACE, "setting l_data to inode "DFID"(%p)\n",
+  PFID(ll_inode2fid(inode)), inode);
ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL);
}
 
diff --git a/drivers/staging/lustre/lustre/llite/dir.c 
b/drivers/staging/lustre/lustre/llite/dir.c
index b457c28..9463da2 100644
--- a/drivers/staging/lustre/lustre/llite/dir.c
+++ b/drivers/staging/lustre/lustre/llite/dir.c
@@ -158,8 +158,8 @@ static int ll_dir_filler(void *_hash, struct page *page0)
int i;
int rc;
 
-   CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) hash %llu\n",
-  inode->i_ino, inode->i_generation, inode, hash);
+   CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p) hash %llu\n",
+  PFID(ll_inode2fid(inode)), inode, hash);
 
LASSERT(max_pages > 0 && max_pages <= MD_MAX_BRW_PAGES);
 
@@ -372,8 +372,8 @@ struct page *ll_get_dir_page(struct inode *dir, __u64 hash,
return ERR_PTR(rc);
}
 
-   CDEBUG(D_INODE, "setting lr_lvb_inode to inode %p (%lu/%u)\n",
-  dir, dir->i_ino, dir->i_generation);
+   CDEBUG(D_INODE, "setting lr_lvb_inode to inode "DFID"(%p)\n",
+  PFID(ll_inode2fid(dir)), dir);
md_set_lock_data(ll_i2sbi(dir)->ll_md_exp,
 _lock_handle, dir, NULL);
} else {
@@ -616,9 +616,9 @@ static int ll_readdir(struct file *filp, struct dir_context 
*ctx)
int api32   = ll_need_32bit_api(sbi);
int rc;
 
-   CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) pos %lu/%llu 32bit_api 
%d\n",
-  inode->i_ino, inode->i_generation,
-  inode, (unsigned long)pos, i_size_read(inode), api32);
+   CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p) pos %lu/%llu 32bit_api 
%d\n",
+  PFID(ll_inode2fid(inode)), inode, (unsigned long)pos,
+  i_size_read(inode), api32);
 
if (pos == MDS_DIR_END_OFF) {
/*
@@ -828,9 +828,8 @@ int ll_dir_getstripe(struct inode 

[PATCH 07/15] staging: lustre: ldlm: remove code wireshark handling

2016-04-27 Thread James Simmons
From: Bruce Korb 

Wireshark output moved to userland file "lustre_dlm_flags_wshark.c"
and only bits that can actually appear "on the wire" are emitted.
The user land "packet-lustre.c" code that references these bits
now gets emitted into that file. e.g. the "local_only" bit is
never put on the wire, so references to it in wireshark are gone.

Signed-off-by: Bruce Korb 
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-2906
Reviewed-by: Keith Mannthey 
Reviewed-on: http://review.whamcloud.com/7963
Reviewed-by: Doug Oucharek 
Reviewed-by: Andreas Dilger 
Signed-off-by: James Simmons 
---
 .../lustre/lustre/include/lustre_dlm_flags.h   |   87 +---
 1 files changed, 1 insertions(+), 86 deletions(-)

diff --git a/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h 
b/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h
index 62d3b31..1eb0cc4 100644
--- a/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h
+++ b/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h
@@ -386,90 +386,5 @@
 
 /** @} subgroup */
 /** @} group */
-#ifdef WIRESHARK_COMPILE
-static int hf_lustre_ldlm_fl_lock_changed= -1;
-static int hf_lustre_ldlm_fl_block_granted   = -1;
-static int hf_lustre_ldlm_fl_block_conv  = -1;
-static int hf_lustre_ldlm_fl_block_wait  = -1;
-static int hf_lustre_ldlm_fl_ast_sent= -1;
-static int hf_lustre_ldlm_fl_replay  = -1;
-static int hf_lustre_ldlm_fl_intent_only = -1;
-static int hf_lustre_ldlm_fl_has_intent  = -1;
-static int hf_lustre_ldlm_fl_flock_deadlock  = -1;
-static int hf_lustre_ldlm_fl_discard_data= -1;
-static int hf_lustre_ldlm_fl_no_timeout  = -1;
-static int hf_lustre_ldlm_fl_block_nowait= -1;
-static int hf_lustre_ldlm_fl_test_lock   = -1;
-static int hf_lustre_ldlm_fl_cancel_on_block = -1;
-static int hf_lustre_ldlm_fl_deny_on_contention  = -1;
-static int hf_lustre_ldlm_fl_ast_discard_data= -1;
-static int hf_lustre_ldlm_fl_fail_loc= -1;
-static int hf_lustre_ldlm_fl_skipped = -1;
-static int hf_lustre_ldlm_fl_cbpending   = -1;
-static int hf_lustre_ldlm_fl_wait_noreproc   = -1;
-static int hf_lustre_ldlm_fl_cancel  = -1;
-static int hf_lustre_ldlm_fl_local_only  = -1;
-static int hf_lustre_ldlm_fl_failed  = -1;
-static int hf_lustre_ldlm_fl_canceling   = -1;
-static int hf_lustre_ldlm_fl_local   = -1;
-static int hf_lustre_ldlm_fl_lvb_ready   = -1;
-static int hf_lustre_ldlm_fl_kms_ignore  = -1;
-static int hf_lustre_ldlm_fl_cp_reqd = -1;
-static int hf_lustre_ldlm_fl_cleaned = -1;
-static int hf_lustre_ldlm_fl_atomic_cb   = -1;
-static int hf_lustre_ldlm_fl_bl_ast  = -1;
-static int hf_lustre_ldlm_fl_bl_done = -1;
-static int hf_lustre_ldlm_fl_no_lru  = -1;
-static int hf_lustre_ldlm_fl_fail_notified   = -1;
-static int hf_lustre_ldlm_fl_destroyed   = -1;
-static int hf_lustre_ldlm_fl_server_lock = -1;
-static int hf_lustre_ldlm_fl_res_locked  = -1;
-static int hf_lustre_ldlm_fl_waited  = -1;
-static int hf_lustre_ldlm_fl_ns_srv  = -1;
-static int hf_lustre_ldlm_fl_excl= -1;
-
-const value_string lustre_ldlm_flags_vals[] = {
-   {LDLM_FL_LOCK_CHANGED,"LDLM_FL_LOCK_CHANGED"},
-   {LDLM_FL_BLOCK_GRANTED,   "LDLM_FL_BLOCK_GRANTED"},
-   {LDLM_FL_BLOCK_CONV,  "LDLM_FL_BLOCK_CONV"},
-   {LDLM_FL_BLOCK_WAIT,  "LDLM_FL_BLOCK_WAIT"},
-   {LDLM_FL_AST_SENT,"LDLM_FL_AST_SENT"},
-   {LDLM_FL_REPLAY,  "LDLM_FL_REPLAY"},
-   {LDLM_FL_INTENT_ONLY, "LDLM_FL_INTENT_ONLY"},
-   {LDLM_FL_HAS_INTENT,  "LDLM_FL_HAS_INTENT"},
-   {LDLM_FL_FLOCK_DEADLOCK,  "LDLM_FL_FLOCK_DEADLOCK"},
-   {LDLM_FL_DISCARD_DATA,"LDLM_FL_DISCARD_DATA"},
-   {LDLM_FL_NO_TIMEOUT,  "LDLM_FL_NO_TIMEOUT"},
-   {LDLM_FL_BLOCK_NOWAIT,"LDLM_FL_BLOCK_NOWAIT"},
-   {LDLM_FL_TEST_LOCK,   "LDLM_FL_TEST_LOCK"},
-   {LDLM_FL_CANCEL_ON_BLOCK, "LDLM_FL_CANCEL_ON_BLOCK"},
-   {LDLM_FL_DENY_ON_CONTENTION,  "LDLM_FL_DENY_ON_CONTENTION"},
-   {LDLM_FL_AST_DISCARD_DATA,"LDLM_FL_AST_DISCARD_DATA"},
-   {LDLM_FL_FAIL_LOC,"LDLM_FL_FAIL_LOC"},
-   {LDLM_FL_SKIPPED, "LDLM_FL_SKIPPED"},
-   {LDLM_FL_CBPENDING,   "LDLM_FL_CBPENDING"},
-   {LDLM_FL_WAIT_NOREPROC,   "LDLM_FL_WAIT_NOREPROC"},
-   {LDLM_FL_CANCEL,  "LDLM_FL_CANCEL"},
-   {LDLM_FL_LOCAL_ONLY,  "LDLM_FL_LOCAL_ONLY"},
-   {LDLM_FL_FAILED,  "LDLM_FL_FAILED"},
-   {LDLM_FL_CANCELING,   

[PATCH 07/15] staging: lustre: ldlm: remove code wireshark handling

2016-04-27 Thread James Simmons
From: Bruce Korb 

Wireshark output moved to userland file "lustre_dlm_flags_wshark.c"
and only bits that can actually appear "on the wire" are emitted.
The user land "packet-lustre.c" code that references these bits
now gets emitted into that file. e.g. the "local_only" bit is
never put on the wire, so references to it in wireshark are gone.

Signed-off-by: Bruce Korb 
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-2906
Reviewed-by: Keith Mannthey 
Reviewed-on: http://review.whamcloud.com/7963
Reviewed-by: Doug Oucharek 
Reviewed-by: Andreas Dilger 
Signed-off-by: James Simmons 
---
 .../lustre/lustre/include/lustre_dlm_flags.h   |   87 +---
 1 files changed, 1 insertions(+), 86 deletions(-)

diff --git a/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h 
b/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h
index 62d3b31..1eb0cc4 100644
--- a/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h
+++ b/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h
@@ -386,90 +386,5 @@
 
 /** @} subgroup */
 /** @} group */
-#ifdef WIRESHARK_COMPILE
-static int hf_lustre_ldlm_fl_lock_changed= -1;
-static int hf_lustre_ldlm_fl_block_granted   = -1;
-static int hf_lustre_ldlm_fl_block_conv  = -1;
-static int hf_lustre_ldlm_fl_block_wait  = -1;
-static int hf_lustre_ldlm_fl_ast_sent= -1;
-static int hf_lustre_ldlm_fl_replay  = -1;
-static int hf_lustre_ldlm_fl_intent_only = -1;
-static int hf_lustre_ldlm_fl_has_intent  = -1;
-static int hf_lustre_ldlm_fl_flock_deadlock  = -1;
-static int hf_lustre_ldlm_fl_discard_data= -1;
-static int hf_lustre_ldlm_fl_no_timeout  = -1;
-static int hf_lustre_ldlm_fl_block_nowait= -1;
-static int hf_lustre_ldlm_fl_test_lock   = -1;
-static int hf_lustre_ldlm_fl_cancel_on_block = -1;
-static int hf_lustre_ldlm_fl_deny_on_contention  = -1;
-static int hf_lustre_ldlm_fl_ast_discard_data= -1;
-static int hf_lustre_ldlm_fl_fail_loc= -1;
-static int hf_lustre_ldlm_fl_skipped = -1;
-static int hf_lustre_ldlm_fl_cbpending   = -1;
-static int hf_lustre_ldlm_fl_wait_noreproc   = -1;
-static int hf_lustre_ldlm_fl_cancel  = -1;
-static int hf_lustre_ldlm_fl_local_only  = -1;
-static int hf_lustre_ldlm_fl_failed  = -1;
-static int hf_lustre_ldlm_fl_canceling   = -1;
-static int hf_lustre_ldlm_fl_local   = -1;
-static int hf_lustre_ldlm_fl_lvb_ready   = -1;
-static int hf_lustre_ldlm_fl_kms_ignore  = -1;
-static int hf_lustre_ldlm_fl_cp_reqd = -1;
-static int hf_lustre_ldlm_fl_cleaned = -1;
-static int hf_lustre_ldlm_fl_atomic_cb   = -1;
-static int hf_lustre_ldlm_fl_bl_ast  = -1;
-static int hf_lustre_ldlm_fl_bl_done = -1;
-static int hf_lustre_ldlm_fl_no_lru  = -1;
-static int hf_lustre_ldlm_fl_fail_notified   = -1;
-static int hf_lustre_ldlm_fl_destroyed   = -1;
-static int hf_lustre_ldlm_fl_server_lock = -1;
-static int hf_lustre_ldlm_fl_res_locked  = -1;
-static int hf_lustre_ldlm_fl_waited  = -1;
-static int hf_lustre_ldlm_fl_ns_srv  = -1;
-static int hf_lustre_ldlm_fl_excl= -1;
-
-const value_string lustre_ldlm_flags_vals[] = {
-   {LDLM_FL_LOCK_CHANGED,"LDLM_FL_LOCK_CHANGED"},
-   {LDLM_FL_BLOCK_GRANTED,   "LDLM_FL_BLOCK_GRANTED"},
-   {LDLM_FL_BLOCK_CONV,  "LDLM_FL_BLOCK_CONV"},
-   {LDLM_FL_BLOCK_WAIT,  "LDLM_FL_BLOCK_WAIT"},
-   {LDLM_FL_AST_SENT,"LDLM_FL_AST_SENT"},
-   {LDLM_FL_REPLAY,  "LDLM_FL_REPLAY"},
-   {LDLM_FL_INTENT_ONLY, "LDLM_FL_INTENT_ONLY"},
-   {LDLM_FL_HAS_INTENT,  "LDLM_FL_HAS_INTENT"},
-   {LDLM_FL_FLOCK_DEADLOCK,  "LDLM_FL_FLOCK_DEADLOCK"},
-   {LDLM_FL_DISCARD_DATA,"LDLM_FL_DISCARD_DATA"},
-   {LDLM_FL_NO_TIMEOUT,  "LDLM_FL_NO_TIMEOUT"},
-   {LDLM_FL_BLOCK_NOWAIT,"LDLM_FL_BLOCK_NOWAIT"},
-   {LDLM_FL_TEST_LOCK,   "LDLM_FL_TEST_LOCK"},
-   {LDLM_FL_CANCEL_ON_BLOCK, "LDLM_FL_CANCEL_ON_BLOCK"},
-   {LDLM_FL_DENY_ON_CONTENTION,  "LDLM_FL_DENY_ON_CONTENTION"},
-   {LDLM_FL_AST_DISCARD_DATA,"LDLM_FL_AST_DISCARD_DATA"},
-   {LDLM_FL_FAIL_LOC,"LDLM_FL_FAIL_LOC"},
-   {LDLM_FL_SKIPPED, "LDLM_FL_SKIPPED"},
-   {LDLM_FL_CBPENDING,   "LDLM_FL_CBPENDING"},
-   {LDLM_FL_WAIT_NOREPROC,   "LDLM_FL_WAIT_NOREPROC"},
-   {LDLM_FL_CANCEL,  "LDLM_FL_CANCEL"},
-   {LDLM_FL_LOCAL_ONLY,  "LDLM_FL_LOCAL_ONLY"},
-   {LDLM_FL_FAILED,  "LDLM_FL_FAILED"},
-   {LDLM_FL_CANCELING,   "LDLM_FL_CANCELING"},
-   {LDLM_FL_LOCAL,   "LDLM_FL_LOCAL"},
-   {LDLM_FL_LVB_READY,   "LDLM_FL_LVB_READY"},
-   

[PATCH 13/15] staging: lustre: osc: Track number of "unstable" pages per osc

2016-04-27 Thread James Simmons
From: Prakash Surya 

This change adds simple accounting hooks for "unstable" pages on a per
OSC basis. Now, in addition to the per filesystem tracking, each OSC
will maintain a running total of its unstable pages. These counters are
exported through the proc interface, and can be read using the lctl
command.

For example:

# Read number of unstable pages contained by each OSC
lctl get_param osc.*.unstable_stats

The motivation for this change is in anticipation of implementing a
"soft sync" functionality, urging servers to commit these unstable
pages to stable storage. The per OSC accounting allows a client to
limit the soft sync request to only the OSCs which have outstanding
unstable pages.

Signed-off-by: Prakash Surya 
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-2139
Reviewed-on: http://review.whamcloud.com/4374
Reviewed-by: Jinshan Xiong 
Reviewed-by: Andreas Dilger 
Reviewed-by: Oleg Drokin 
Signed-off-by: James Simmons 
---
 drivers/staging/lustre/lustre/include/obd.h   |1 +
 drivers/staging/lustre/lustre/ldlm/ldlm_lib.c |1 +
 drivers/staging/lustre/lustre/osc/lproc_osc.c |   18 ++
 drivers/staging/lustre/lustre/osc/osc_cache.c |6 ++
 4 files changed, 26 insertions(+), 0 deletions(-)

diff --git a/drivers/staging/lustre/lustre/include/obd.h 
b/drivers/staging/lustre/lustre/include/obd.h
index 3f24a5b..d0c0c26 100644
--- a/drivers/staging/lustre/lustre/include/obd.h
+++ b/drivers/staging/lustre/lustre/include/obd.h
@@ -325,6 +325,7 @@ struct client_obd {
atomic_t cl_lru_in_list;
struct list_head cl_lru_list; /* lru page list */
spinlock_t   cl_lru_list_lock; /* page list protector */
+   atomic_t cl_unstable_count;
 
/* number of in flight destroy rpcs is limited to max_rpcs_in_flight */
atomic_t cl_destroy_in_flight;
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c 
b/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c
index bc951c0..32486b2 100644
--- a/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c
@@ -334,6 +334,7 @@ int client_obd_setup(struct obd_device *obddev, struct 
lustre_cfg *lcfg)
atomic_set(>cl_lru_in_list, 0);
INIT_LIST_HEAD(>cl_lru_list);
spin_lock_init(>cl_lru_list_lock);
+   atomic_set(>cl_unstable_count, 0);
 
init_waitqueue_head(>cl_destroy_waitq);
atomic_set(>cl_destroy_in_flight, 0);
diff --git a/drivers/staging/lustre/lustre/osc/lproc_osc.c 
b/drivers/staging/lustre/lustre/osc/lproc_osc.c
index 6e57f53..33a1132 100644
--- a/drivers/staging/lustre/lustre/osc/lproc_osc.c
+++ b/drivers/staging/lustre/lustre/osc/lproc_osc.c
@@ -593,6 +593,23 @@ static ssize_t max_pages_per_rpc_store(struct kobject 
*kobj,
 }
 LUSTRE_RW_ATTR(max_pages_per_rpc);
 
+static ssize_t unstable_stats_show(struct kobject *kobj,
+  struct attribute *attr,
+  char *buf)
+{
+   struct obd_device *dev = container_of(kobj, struct obd_device,
+ obd_kobj);
+   struct client_obd *cli = >u.cli;
+   int pages, mb;
+
+   pages = atomic_read(>cl_unstable_count);
+   mb = (pages * PAGE_SIZE) >> 20;
+
+   return sprintf(buf, "unstable_pages: %8d\n"
+  "unstable_mb:%8d\n", pages, mb);
+}
+LUSTRE_RO_ATTR(unstable_stats);
+
 LPROC_SEQ_FOPS_RO_TYPE(osc, connect_flags);
 LPROC_SEQ_FOPS_RO_TYPE(osc, server_uuid);
 LPROC_SEQ_FOPS_RO_TYPE(osc, conn_uuid);
@@ -802,6 +819,7 @@ static struct attribute *osc_attrs[] = {
_attr_max_pages_per_rpc.attr,
_attr_max_rpcs_in_flight.attr,
_attr_resend_count.attr,
+   _attr_unstable_stats.attr,
NULL,
 };
 
diff --git a/drivers/staging/lustre/lustre/osc/osc_cache.c 
b/drivers/staging/lustre/lustre/osc/osc_cache.c
index 5cd8eef..7d1c2c5 100644
--- a/drivers/staging/lustre/lustre/osc/osc_cache.c
+++ b/drivers/staging/lustre/lustre/osc/osc_cache.c
@@ -1873,6 +1873,9 @@ void osc_dec_unstable_pages(struct ptlrpc_request *req)
atomic_sub(page_count, >cl_cache->ccc_unstable_nr);
LASSERT(atomic_read(>cl_cache->ccc_unstable_nr) >= 0);
 
+   atomic_sub(page_count, >cl_unstable_count);
+   LASSERT(atomic_read(>cl_unstable_count) >= 0);
+
atomic_sub(page_count, _unstable_pages);
LASSERT(atomic_read(_unstable_pages) >= 0);
 
@@ -1904,6 +1907,9 @@ void osc_inc_unstable_pages(struct ptlrpc_request *req)
LASSERT(atomic_read(>cl_cache->ccc_unstable_nr) >= 0);
atomic_add(page_count, >cl_cache->ccc_unstable_nr);
 
+   LASSERT(atomic_read(>cl_unstable_count) >= 0);
+   atomic_add(page_count, >cl_unstable_count);
+
LASSERT(atomic_read(_unstable_pages) >= 0);

[PATCH 15/15] staging: lustre: ptlrpc: quiet warning for 2.1/2.5 connections

2016-04-27 Thread James Simmons
From: Andreas Dilger 

The Lustre 2.5.4 client will print a warning about connections with
2.1.3 servers, yet they are still supposed to be interoperable.
Increase the window of warning to be up to Lustre 2.5.50, since we
do not intend to allow interoperability between 2.1 and 2.6 systems.
This was from the time when major releases like Lustre 1.4 and 1.8
were many years apart and would have no chance for interoperation.

Only print this message once per client, to avoid flooding the console
for connections to many servers or frequent network reconnections.
Server versions should all be nearly the same in any case.

Signed-off-by: Andreas Dilger 
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-2528
Reviewed-on: http://review.whamcloud.com/7916
Reviewed-by: Bob Glossman 
Reviewed-by: Jian Yu 
Reviewed-by: Oleg Drokin 
Signed-off-by: James Simmons 
---
 drivers/staging/lustre/lustre/ptlrpc/import.c |   11 ++-
 1 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/staging/lustre/lustre/ptlrpc/import.c 
b/drivers/staging/lustre/lustre/ptlrpc/import.c
index bf7b9d2..a4f7544 100644
--- a/drivers/staging/lustre/lustre/ptlrpc/import.c
+++ b/drivers/staging/lustre/lustre/ptlrpc/import.c
@@ -1001,6 +1001,8 @@ finish:
return 0;
}
} else {
+   static bool warned;
+
spin_lock(>imp_lock);
list_del(>imp_conn_current->oic_item);
list_add(>imp_conn_current->oic_item, >imp_conn_list);
@@ -1020,7 +1022,7 @@ finish:
goto out;
}
 
-   if ((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
+   if (!warned && (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
(ocd->ocd_version > LUSTRE_VERSION_CODE +
LUSTRE_VERSION_OFFSET_WARN ||
 ocd->ocd_version < LUSTRE_VERSION_CODE -
@@ -1028,10 +1030,8 @@ finish:
/* Sigh, some compilers do not like #ifdef in the middle
 * of macro arguments
 */
-   const char *older = "older. Consider upgrading server 
or downgrading client"
-   ;
-   const char *newer = "newer than client version. 
Consider upgrading client"
-   ;
+   const char *older = "older than client. Consider 
upgrading server";
+   const char *newer = "newer than client. Consider 
recompiling application";
 
LCONSOLE_WARN("Server %s version (%d.%d.%d.%d) is much 
%s (%s)\n",
  obd2cli_tgt(imp->imp_obd),
@@ -1041,6 +1041,7 @@ finish:
  OBD_OCD_VERSION_FIX(ocd->ocd_version),
  ocd->ocd_version > LUSTRE_VERSION_CODE ?
  newer : older, LUSTRE_VERSION_STRING);
+   warned = true;
}
 
 #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0)
-- 
1.7.1



[PATCH 10/15] staging: lustre: clio: add debug message in osc_completion()

2016-04-27 Thread James Simmons
From: Niu Yawei 

Replace LASSERT with LASSERTF in osc_completion, thus we can get
more info when the LASSERT is triggered.

Signed-off-by: Niu Yawei 
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-3843
Reviewed-on: http://review.whamcloud.com/7494
Reviewed-by: Bobi Jam 
Reviewed-by: Jinshan Xiong 
Reviewed-by: Oleg Drokin 
Signed-off-by: James Simmons 
---
 drivers/staging/lustre/lustre/osc/osc_cache.c |6 --
 1 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/staging/lustre/lustre/osc/osc_cache.c 
b/drivers/staging/lustre/lustre/osc/osc_cache.c
index dccd309..de28e42 100644
--- a/drivers/staging/lustre/lustre/osc/osc_cache.c
+++ b/drivers/staging/lustre/lustre/osc/osc_cache.c
@@ -1333,8 +1333,10 @@ static int osc_completion(const struct lu_env *env, 
struct osc_async_page *oap,
int srvlock;
 
cmd &= ~OBD_BRW_NOQUOTA;
-   LASSERT(equi(page->cp_state == CPS_PAGEIN,  cmd == OBD_BRW_READ));
-   LASSERT(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE));
+   LASSERTF(equi(page->cp_state == CPS_PAGEIN, cmd == OBD_BRW_READ),
+"cp_state:%u, cmd:%d\n", page->cp_state, cmd);
+   LASSERTF(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE),
+"cp_state:%u, cmd:%d\n", page->cp_state, cmd);
LASSERT(opg->ops_transfer_pinned);
 
/*
-- 
1.7.1



[PATCH] arm64: Relocate screen_info.lfb_base on PCI BAR allocation

2016-04-27 Thread Alexander Graf
When booting with efifb, we get a frame buffer address passed into the system.
This address can be backed by any device, including PCI devices.

PCI devices can have their BARs mapped to various places inside the PCI window
though. Linux makes use of that on early boot and usually maps PCI BARs wherever
it thinks makes sense.

If we now load the efifb driver after that BAR map has happened, the frame
buffer address we received may be invalid, because it was in a BAR map before
Linux modified it.

To work around that issue, this patch introduces a BAR mapping callback that
gets called every time Linux (re)allocates a BAR. That way our arm64 efi code
can check whether the frame buffer is inside the old map and adjust it to
the new one.

With this and the efifb patches applied, I can successfully see efifb output
even after Linux remapped BARs.

Signed-off-by: Alexander Graf 
---
 arch/arm64/kernel/efi.c | 40 +++-
 drivers/pci/setup-res.c | 29 +
 include/linux/pci.h |  8 
 3 files changed, 76 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
index 56a76b6..3612110 100644
--- a/arch/arm64/kernel/efi.c
+++ b/arch/arm64/kernel/efi.c
@@ -27,6 +27,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -213,6 +214,41 @@ static __init void reserve_regions(void)
set_bit(EFI_MEMMAP, );
 }
 
+#ifdef CONFIG_PCI
+static bool efi_pci_overlaps_efifb(struct pci_bar_update_info *update_info)
+{
+   /* is the screen_info frame buffer inside the pci BAR? */
+   if (screen_info.lfb_base >= update_info->old_start &&
+   (screen_info.lfb_base + screen_info.lfb_size) <=
+(update_info->old_start + update_info->size))
+   return true;
+
+   return false;
+}
+
+static int efi_pci_notifier(struct notifier_block *self,
+   unsigned long cmd, void *v)
+{
+   struct pci_bar_update_info *update_info = v;
+
+   /*
+* When we reallocate a BAR that contains our frame buffer, set the
+* screen_info base to where it belongs
+*/
+   if (efi_pci_overlaps_efifb(update_info)) {
+   u64 diff = (update_info->new_start - update_info->old_start);
+   screen_info.lfb_base += diff;
+   }
+
+   return NOTIFY_OK;
+}
+static struct notifier_block efi_pci_notifier_block = {
+   .notifier_call = efi_pci_notifier,
+};
+#else
+#define pci_notify_on_update_resource(a)
+#endif
+
 void __init efi_init_fdt(void *fdt)
 {
struct efi_fdt_params params;
@@ -246,8 +282,10 @@ void __init efi_init_fdt(void *fdt)
reserve_regions();
early_memunmap(memmap.map, params.mmap_size);
 
-   if (screen_info.orig_video_isVGA == VIDEO_TYPE_EFI)
+   if (screen_info.orig_video_isVGA == VIDEO_TYPE_EFI) {
+   pci_notify_on_update_resource(_pci_notifier_block);
memblock_reserve(screen_info.lfb_base, screen_info.lfb_size);
+   }
 }
 
 static int __init register_gop_device(void)
diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c
index 604011e..d5c24fc 100644
--- a/drivers/pci/setup-res.c
+++ b/drivers/pci/setup-res.c
@@ -23,8 +23,10 @@
 #include 
 #include 
 #include 
+#include 
 #include "pci.h"
 
+static RAW_NOTIFIER_HEAD(bar_update_chain);
 
 void pci_update_resource(struct pci_dev *dev, int resno)
 {
@@ -35,6 +37,9 @@ void pci_update_resource(struct pci_dev *dev, int resno)
int reg;
enum pci_bar_type type;
struct resource *res = dev->resource + resno;
+   struct pci_bar_update_info update_info;
+   struct pci_bus_region update_reg;
+   struct resource update_res;
 
if (dev->is_virtfn) {
dev_warn(>dev, "can't update VF BAR%d\n", resno);
@@ -77,6 +82,22 @@ void pci_update_resource(struct pci_dev *dev, int resno)
}
 
/*
+* Fetch the old BAR location from the device, so we can notify
+* users of that BAR that its location is changing.
+*/
+   pci_read_config_dword(dev, reg, );
+   update_reg.start = check & PCI_BASE_ADDRESS_MEM_MASK;
+   if (check & PCI_BASE_ADDRESS_MEM_TYPE_64) {
+   pci_read_config_dword(dev, reg, );
+   update_reg.start |= ((u64)check) << 32;
+   }
+   update_info.size = region.end - region.start;
+   update_reg.end = update_reg.start + update_info.size;
+   pcibios_bus_to_resource(dev->bus, _res, _reg);
+   update_info.old_start = update_res.start;
+   update_info.new_start = res->start;
+
+   /*
 * We can't update a 64-bit BAR atomically, so when possible,
 * disable decoding so that a half-updated BAR won't conflict
 * with another device.
@@ -108,6 +129,14 @@ void pci_update_resource(struct pci_dev *dev, int resno)
 
if (disable)
pci_write_config_word(dev, PCI_COMMAND, cmd);
+
+   

[PATCH 10/15] staging: lustre: clio: add debug message in osc_completion()

2016-04-27 Thread James Simmons
From: Niu Yawei 

Replace LASSERT with LASSERTF in osc_completion() so that we get
more information when the assertion is triggered.

Signed-off-by: Niu Yawei 
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-3843
Reviewed-on: http://review.whamcloud.com/7494
Reviewed-by: Bobi Jam 
Reviewed-by: Jinshan Xiong 
Reviewed-by: Oleg Drokin 
Signed-off-by: James Simmons 
---
 drivers/staging/lustre/lustre/osc/osc_cache.c |6 --
 1 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/staging/lustre/lustre/osc/osc_cache.c 
b/drivers/staging/lustre/lustre/osc/osc_cache.c
index dccd309..de28e42 100644
--- a/drivers/staging/lustre/lustre/osc/osc_cache.c
+++ b/drivers/staging/lustre/lustre/osc/osc_cache.c
@@ -1333,8 +1333,10 @@ static int osc_completion(const struct lu_env *env, 
struct osc_async_page *oap,
int srvlock;
 
cmd &= ~OBD_BRW_NOQUOTA;
-   LASSERT(equi(page->cp_state == CPS_PAGEIN,  cmd == OBD_BRW_READ));
-   LASSERT(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE));
+   LASSERTF(equi(page->cp_state == CPS_PAGEIN, cmd == OBD_BRW_READ),
+"cp_state:%u, cmd:%d\n", page->cp_state, cmd);
+   LASSERTF(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE),
+"cp_state:%u, cmd:%d\n", page->cp_state, cmd);
LASSERT(opg->ops_transfer_pinned);
 
/*
-- 
1.7.1



[PATCH] arm64: Relocate screen_info.lfb_base on PCI BAR allocation

2016-04-27 Thread Alexander Graf
When booting with efifb, we get a frame buffer address passed into the system.
This address can be backed by any device, including PCI devices.

PCI devices can have their BARs mapped to various places inside the PCI window
though. Linux makes use of that on early boot and usually maps PCI BARs wherever
it thinks makes sense.

If we load the efifb driver after that BAR remapping has happened, the frame
buffer address we received may be invalid, because it pointed into a BAR
mapping that Linux has since modified.

To work around that issue, this patch introduces a BAR mapping callback that
gets called every time Linux (re)allocates a BAR. That way our arm64 efi code
can check whether the frame buffer is inside the old map and adjust it to
the new one.

With this and the efifb patches applied, I can successfully see efifb output
even after Linux remapped BARs.

Signed-off-by: Alexander Graf 
---
 arch/arm64/kernel/efi.c | 40 +++-
 drivers/pci/setup-res.c | 29 +
 include/linux/pci.h |  8 
 3 files changed, 76 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
index 56a76b6..3612110 100644
--- a/arch/arm64/kernel/efi.c
+++ b/arch/arm64/kernel/efi.c
@@ -27,6 +27,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -213,6 +214,41 @@ static __init void reserve_regions(void)
set_bit(EFI_MEMMAP, );
 }
 
+#ifdef CONFIG_PCI
+static bool efi_pci_overlaps_efifb(struct pci_bar_update_info *update_info)
+{
+   /* is the screen_info frame buffer inside the pci BAR? */
+   if (screen_info.lfb_base >= update_info->old_start &&
+   (screen_info.lfb_base + screen_info.lfb_size) <=
+(update_info->old_start + update_info->size))
+   return true;
+
+   return false;
+}
+
+static int efi_pci_notifier(struct notifier_block *self,
+   unsigned long cmd, void *v)
+{
+   struct pci_bar_update_info *update_info = v;
+
+   /*
+* When we reallocate a BAR that contains our frame buffer, set the
+* screen_info base to where it belongs
+*/
+   if (efi_pci_overlaps_efifb(update_info)) {
+   u64 diff = (update_info->new_start - update_info->old_start);
+   screen_info.lfb_base += diff;
+   }
+
+   return NOTIFY_OK;
+}
+static struct notifier_block efi_pci_notifier_block = {
+   .notifier_call = efi_pci_notifier,
+};
+#else
+#define pci_notify_on_update_resource(a)
+#endif
+
 void __init efi_init_fdt(void *fdt)
 {
struct efi_fdt_params params;
@@ -246,8 +282,10 @@ void __init efi_init_fdt(void *fdt)
reserve_regions();
early_memunmap(memmap.map, params.mmap_size);
 
-   if (screen_info.orig_video_isVGA == VIDEO_TYPE_EFI)
+   if (screen_info.orig_video_isVGA == VIDEO_TYPE_EFI) {
+   pci_notify_on_update_resource(_pci_notifier_block);
memblock_reserve(screen_info.lfb_base, screen_info.lfb_size);
+   }
 }
 
 static int __init register_gop_device(void)
diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c
index 604011e..d5c24fc 100644
--- a/drivers/pci/setup-res.c
+++ b/drivers/pci/setup-res.c
@@ -23,8 +23,10 @@
 #include 
 #include 
 #include 
+#include 
 #include "pci.h"
 
+static RAW_NOTIFIER_HEAD(bar_update_chain);
 
 void pci_update_resource(struct pci_dev *dev, int resno)
 {
@@ -35,6 +37,9 @@ void pci_update_resource(struct pci_dev *dev, int resno)
int reg;
enum pci_bar_type type;
struct resource *res = dev->resource + resno;
+   struct pci_bar_update_info update_info;
+   struct pci_bus_region update_reg;
+   struct resource update_res;
 
if (dev->is_virtfn) {
dev_warn(>dev, "can't update VF BAR%d\n", resno);
@@ -77,6 +82,22 @@ void pci_update_resource(struct pci_dev *dev, int resno)
}
 
/*
+* Fetch the old BAR location from the device, so we can notify
+* users of that BAR that its location is changing.
+*/
+   pci_read_config_dword(dev, reg, );
+   update_reg.start = check & PCI_BASE_ADDRESS_MEM_MASK;
+   if (check & PCI_BASE_ADDRESS_MEM_TYPE_64) {
+   pci_read_config_dword(dev, reg, );
+   update_reg.start |= ((u64)check) << 32;
+   }
+   update_info.size = region.end - region.start;
+   update_reg.end = update_reg.start + update_info.size;
+   pcibios_bus_to_resource(dev->bus, _res, _reg);
+   update_info.old_start = update_res.start;
+   update_info.new_start = res->start;
+
+   /*
 * We can't update a 64-bit BAR atomically, so when possible,
 * disable decoding so that a half-updated BAR won't conflict
 * with another device.
@@ -108,6 +129,14 @@ void pci_update_resource(struct pci_dev *dev, int resno)
 
if (disable)
pci_write_config_word(dev, PCI_COMMAND, cmd);
+
+   /* Tell 

[PATCH 13/15] staging: lustre: osc: Track number of "unstable" pages per osc

2016-04-27 Thread James Simmons
From: Prakash Surya 

This change adds simple accounting hooks for "unstable" pages on a per
OSC basis. Now, in addition to the per filesystem tracking, each OSC
will maintain a running total of its unstable pages. These counters are
exported through the proc interface, and can be read using the lctl
command.

For example:

# Read number of unstable pages contained by each OSC
lctl get_param osc.*.unstable_stats

The motivation for this change is in anticipation of implementing a
"soft sync" functionality, urging servers to commit these unstable
pages to stable storage. The per OSC accounting allows a client to
limit the soft sync request to only the OSCs which have outstanding
unstable pages.

Signed-off-by: Prakash Surya 
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-2139
Reviewed-on: http://review.whamcloud.com/4374
Reviewed-by: Jinshan Xiong 
Reviewed-by: Andreas Dilger 
Reviewed-by: Oleg Drokin 
Signed-off-by: James Simmons 
---
 drivers/staging/lustre/lustre/include/obd.h   |1 +
 drivers/staging/lustre/lustre/ldlm/ldlm_lib.c |1 +
 drivers/staging/lustre/lustre/osc/lproc_osc.c |   18 ++
 drivers/staging/lustre/lustre/osc/osc_cache.c |6 ++
 4 files changed, 26 insertions(+), 0 deletions(-)

diff --git a/drivers/staging/lustre/lustre/include/obd.h 
b/drivers/staging/lustre/lustre/include/obd.h
index 3f24a5b..d0c0c26 100644
--- a/drivers/staging/lustre/lustre/include/obd.h
+++ b/drivers/staging/lustre/lustre/include/obd.h
@@ -325,6 +325,7 @@ struct client_obd {
atomic_t cl_lru_in_list;
struct list_head cl_lru_list; /* lru page list */
spinlock_t   cl_lru_list_lock; /* page list protector */
+   atomic_t cl_unstable_count;
 
/* number of in flight destroy rpcs is limited to max_rpcs_in_flight */
atomic_t cl_destroy_in_flight;
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c 
b/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c
index bc951c0..32486b2 100644
--- a/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c
@@ -334,6 +334,7 @@ int client_obd_setup(struct obd_device *obddev, struct 
lustre_cfg *lcfg)
atomic_set(>cl_lru_in_list, 0);
INIT_LIST_HEAD(>cl_lru_list);
spin_lock_init(>cl_lru_list_lock);
+   atomic_set(>cl_unstable_count, 0);
 
init_waitqueue_head(>cl_destroy_waitq);
atomic_set(>cl_destroy_in_flight, 0);
diff --git a/drivers/staging/lustre/lustre/osc/lproc_osc.c 
b/drivers/staging/lustre/lustre/osc/lproc_osc.c
index 6e57f53..33a1132 100644
--- a/drivers/staging/lustre/lustre/osc/lproc_osc.c
+++ b/drivers/staging/lustre/lustre/osc/lproc_osc.c
@@ -593,6 +593,23 @@ static ssize_t max_pages_per_rpc_store(struct kobject 
*kobj,
 }
 LUSTRE_RW_ATTR(max_pages_per_rpc);
 
+static ssize_t unstable_stats_show(struct kobject *kobj,
+  struct attribute *attr,
+  char *buf)
+{
+   struct obd_device *dev = container_of(kobj, struct obd_device,
+ obd_kobj);
+   struct client_obd *cli = >u.cli;
+   int pages, mb;
+
+   pages = atomic_read(>cl_unstable_count);
+   mb = (pages * PAGE_SIZE) >> 20;
+
+   return sprintf(buf, "unstable_pages: %8d\n"
+  "unstable_mb:%8d\n", pages, mb);
+}
+LUSTRE_RO_ATTR(unstable_stats);
+
 LPROC_SEQ_FOPS_RO_TYPE(osc, connect_flags);
 LPROC_SEQ_FOPS_RO_TYPE(osc, server_uuid);
 LPROC_SEQ_FOPS_RO_TYPE(osc, conn_uuid);
@@ -802,6 +819,7 @@ static struct attribute *osc_attrs[] = {
_attr_max_pages_per_rpc.attr,
_attr_max_rpcs_in_flight.attr,
_attr_resend_count.attr,
+   _attr_unstable_stats.attr,
NULL,
 };
 
diff --git a/drivers/staging/lustre/lustre/osc/osc_cache.c 
b/drivers/staging/lustre/lustre/osc/osc_cache.c
index 5cd8eef..7d1c2c5 100644
--- a/drivers/staging/lustre/lustre/osc/osc_cache.c
+++ b/drivers/staging/lustre/lustre/osc/osc_cache.c
@@ -1873,6 +1873,9 @@ void osc_dec_unstable_pages(struct ptlrpc_request *req)
atomic_sub(page_count, >cl_cache->ccc_unstable_nr);
LASSERT(atomic_read(>cl_cache->ccc_unstable_nr) >= 0);
 
+   atomic_sub(page_count, >cl_unstable_count);
+   LASSERT(atomic_read(>cl_unstable_count) >= 0);
+
atomic_sub(page_count, _unstable_pages);
LASSERT(atomic_read(_unstable_pages) >= 0);
 
@@ -1904,6 +1907,9 @@ void osc_inc_unstable_pages(struct ptlrpc_request *req)
LASSERT(atomic_read(>cl_cache->ccc_unstable_nr) >= 0);
atomic_add(page_count, >cl_cache->ccc_unstable_nr);
 
+   LASSERT(atomic_read(>cl_unstable_count) >= 0);
+   atomic_add(page_count, >cl_unstable_count);
+
LASSERT(atomic_read(_unstable_pages) >= 0);
atomic_add(page_count, _unstable_pages);
 
-- 
1.7.1



[PATCH 15/15] staging: lustre: ptlrpc: quiet warning for 2.1/2.5 connections

2016-04-27 Thread James Simmons
From: Andreas Dilger 

The Lustre 2.5.4 client will print a warning about connections with
2.1.3 servers, yet they are still supposed to be interoperable.
Increase the window of warning to be up to Lustre 2.5.50, since we
do not intend to allow interoperability between 2.1 and 2.6 systems.
This dates from the time when major releases like Lustre 1.4 and 1.8
were many years apart and had no chance of interoperating.

Only print this message once per client, to avoid flooding the console
for connections to many servers or frequent network reconnections.
Server versions should all be nearly the same in any case.

Signed-off-by: Andreas Dilger 
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-2528
Reviewed-on: http://review.whamcloud.com/7916
Reviewed-by: Bob Glossman 
Reviewed-by: Jian Yu 
Reviewed-by: Oleg Drokin 
Signed-off-by: James Simmons 
---
 drivers/staging/lustre/lustre/ptlrpc/import.c |   11 ++-
 1 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/staging/lustre/lustre/ptlrpc/import.c 
b/drivers/staging/lustre/lustre/ptlrpc/import.c
index bf7b9d2..a4f7544 100644
--- a/drivers/staging/lustre/lustre/ptlrpc/import.c
+++ b/drivers/staging/lustre/lustre/ptlrpc/import.c
@@ -1001,6 +1001,8 @@ finish:
return 0;
}
} else {
+   static bool warned;
+
spin_lock(>imp_lock);
list_del(>imp_conn_current->oic_item);
list_add(>imp_conn_current->oic_item, >imp_conn_list);
@@ -1020,7 +1022,7 @@ finish:
goto out;
}
 
-   if ((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
+   if (!warned && (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
(ocd->ocd_version > LUSTRE_VERSION_CODE +
LUSTRE_VERSION_OFFSET_WARN ||
 ocd->ocd_version < LUSTRE_VERSION_CODE -
@@ -1028,10 +1030,8 @@ finish:
/* Sigh, some compilers do not like #ifdef in the middle
 * of macro arguments
 */
-   const char *older = "older. Consider upgrading server 
or downgrading client"
-   ;
-   const char *newer = "newer than client version. 
Consider upgrading client"
-   ;
+   const char *older = "older than client. Consider 
upgrading server";
+   const char *newer = "newer than client. Consider 
recompiling application";
 
LCONSOLE_WARN("Server %s version (%d.%d.%d.%d) is much 
%s (%s)\n",
  obd2cli_tgt(imp->imp_obd),
@@ -1041,6 +1041,7 @@ finish:
  OBD_OCD_VERSION_FIX(ocd->ocd_version),
  ocd->ocd_version > LUSTRE_VERSION_CODE ?
  newer : older, LUSTRE_VERSION_STRING);
+   warned = true;
}
 
 #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0)
-- 
1.7.1



[PATCH 14/15] staging: lustre: osc: Use SOFT_SYNC to urge server commit

2016-04-27 Thread James Simmons
From: Prakash Surya 

This change adds a BRW page flag, OBD_BRW_SOFT_SYNC. This flag is
intended to urge a server to commit a client's unstable pages to
stable storage. A client will add this flag to any BRW requests while
it is in a state where it has "many" unstable pages pinned in its cache.

The server, upon receiving a page with this flag set, *should* begin
an async commit. The idea is that, with the proactive async commit,
the client's pinned unstable pages will transition into a stable state
faster than they would have otherwise. However, the server-side portion
of this agreement is still unimplemented, so the OBD_BRW_SOFT_SYNC flag
will currently fall on deaf ears.

Signed-off-by: Prakash Surya 
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-2139
Reviewed-on: http://review.whamcloud.com/4375
Reviewed-by: Jinshan Xiong 
Reviewed-by: Oleg Drokin 
Signed-off-by: James Simmons 
---
 .../lustre/lustre/include/lustre/lustre_idl.h  |5 +++
 drivers/staging/lustre/lustre/osc/osc_cache.c  |3 ++
 drivers/staging/lustre/lustre/osc/osc_internal.h   |1 +
 drivers/staging/lustre/lustre/osc/osc_page.c   |   29 
 drivers/staging/lustre/lustre/osc/osc_request.c|3 +-
 drivers/staging/lustre/lustre/ptlrpc/wiretest.c|6 
 6 files changed, 46 insertions(+), 1 deletions(-)

diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h 
b/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h
index c3565bf..26819ee 100644
--- a/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h
+++ b/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h
@@ -1734,6 +1734,11 @@ void lustre_swab_obd_statfs(struct obd_statfs *os);
 #define OBD_BRW_MEMALLOC   0x800 /* Client runs in the "kswapd" context */
 #define OBD_BRW_OVER_USRQUOTA 0x1000 /* Running out of user quota */
 #define OBD_BRW_OVER_GRPQUOTA 0x2000 /* Running out of group quota */
+#define OBD_BRW_SOFT_SYNC 0x4000 /* This flag notifies the server
+ * that the client is running low on
+ * space for unstable pages; asking
+ * it to sync quickly
+ */
 
 #define OBD_OBJECT_EOF 0xULL
 
diff --git a/drivers/staging/lustre/lustre/osc/osc_cache.c 
b/drivers/staging/lustre/lustre/osc/osc_cache.c
index 7d1c2c5..5a14bea 100644
--- a/drivers/staging/lustre/lustre/osc/osc_cache.c
+++ b/drivers/staging/lustre/lustre/osc/osc_cache.c
@@ -2437,6 +2437,9 @@ int osc_queue_async_io(const struct lu_env *env, struct 
cl_io *io,
return rc;
}
 
+   if (osc_over_unstable_soft_limit(cli))
+   brw_flags |= OBD_BRW_SOFT_SYNC;
+
oap->oap_cmd = cmd;
oap->oap_page_off = ops->ops_from;
oap->oap_count = ops->ops_to - ops->ops_from;
diff --git a/drivers/staging/lustre/lustre/osc/osc_internal.h 
b/drivers/staging/lustre/lustre/osc/osc_internal.h
index 39e6138..7fad827 100644
--- a/drivers/staging/lustre/lustre/osc/osc_internal.h
+++ b/drivers/staging/lustre/lustre/osc/osc_internal.h
@@ -202,6 +202,7 @@ int osc_quotacheck(struct obd_device *unused, struct 
obd_export *exp,
 int osc_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk);
 void osc_inc_unstable_pages(struct ptlrpc_request *req);
 void osc_dec_unstable_pages(struct ptlrpc_request *req);
+int  osc_over_unstable_soft_limit(struct client_obd *cli);
 
 struct ldlm_lock *osc_dlmlock_at_pgoff(const struct lu_env *env,
   struct osc_object *obj, pgoff_t index,
diff --git a/drivers/staging/lustre/lustre/osc/osc_page.c 
b/drivers/staging/lustre/lustre/osc/osc_page.c
index 5ec5508..c29c2ea 100644
--- a/drivers/staging/lustre/lustre/osc/osc_page.c
+++ b/drivers/staging/lustre/lustre/osc/osc_page.c
@@ -346,6 +346,32 @@ int osc_page_init(const struct lu_env *env, struct 
cl_object *obj,
return result;
 }
 
+int osc_over_unstable_soft_limit(struct client_obd *cli)
+{
+   long obd_upages, obd_dpages, osc_upages;
+
+   /* Can't check cli->cl_unstable_count, therefore, no soft limit */
+   if (!cli)
+   return 0;
+
+   obd_upages = atomic_read(_unstable_pages);
+   obd_dpages = atomic_read(_dirty_pages);
+
+   osc_upages = atomic_read(>cl_unstable_count);
+
+   /*
+* obd_max_dirty_pages is the max number of (dirty + unstable)
+* pages allowed at any given time. To simulate an unstable page
+* only limit, we subtract the current number of dirty pages
+* from this max. This difference is roughly the amount of pages
+* currently available for unstable pages. Thus, the soft limit
+* is half of that difference. Check osc_upages to ensure we don't
+* set SOFT_SYNC for OSCs without any 

[PATCH 14/15] staging: lustre: osc: Use SOFT_SYNC to urge server commit

2016-04-27 Thread James Simmons
From: Prakash Surya 

This change adds a BRW page flag, OBD_BRW_SOFT_SYNC. This flag is
intended to urge a server to commit a client's unstable pages to
stable storage. A client will add this flag to any BRW requests while
it is in a state where it has "many" unstable pages pinned in its cache.

The server, upon receiving a page with this flag set, *should* begin
an async commit. The idea is that, with the proactive async commit,
the client's pinned unstable pages will transition into a stable state
faster than they would have otherwise. However, the server-side portion
of this agreement is still unimplemented, so the OBD_BRW_SOFT_SYNC flag
will currently fall on deaf ears.

Signed-off-by: Prakash Surya 
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-2139
Reviewed-on: http://review.whamcloud.com/4375
Reviewed-by: Jinshan Xiong 
Reviewed-by: Oleg Drokin 
Signed-off-by: James Simmons 
---
 .../lustre/lustre/include/lustre/lustre_idl.h  |5 +++
 drivers/staging/lustre/lustre/osc/osc_cache.c  |3 ++
 drivers/staging/lustre/lustre/osc/osc_internal.h   |1 +
 drivers/staging/lustre/lustre/osc/osc_page.c   |   29 
 drivers/staging/lustre/lustre/osc/osc_request.c|3 +-
 drivers/staging/lustre/lustre/ptlrpc/wiretest.c|6 
 6 files changed, 46 insertions(+), 1 deletions(-)

diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h 
b/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h
index c3565bf..26819ee 100644
--- a/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h
+++ b/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h
@@ -1734,6 +1734,11 @@ void lustre_swab_obd_statfs(struct obd_statfs *os);
 #define OBD_BRW_MEMALLOC   0x800 /* Client runs in the "kswapd" context */
 #define OBD_BRW_OVER_USRQUOTA 0x1000 /* Running out of user quota */
 #define OBD_BRW_OVER_GRPQUOTA 0x2000 /* Running out of group quota */
+#define OBD_BRW_SOFT_SYNC 0x4000 /* This flag notifies the server
+ * that the client is running low on
+ * space for unstable pages; asking
+ * it to sync quickly
+ */
 
 #define OBD_OBJECT_EOF 0xULL
 
diff --git a/drivers/staging/lustre/lustre/osc/osc_cache.c 
b/drivers/staging/lustre/lustre/osc/osc_cache.c
index 7d1c2c5..5a14bea 100644
--- a/drivers/staging/lustre/lustre/osc/osc_cache.c
+++ b/drivers/staging/lustre/lustre/osc/osc_cache.c
@@ -2437,6 +2437,9 @@ int osc_queue_async_io(const struct lu_env *env, struct 
cl_io *io,
return rc;
}
 
+   if (osc_over_unstable_soft_limit(cli))
+   brw_flags |= OBD_BRW_SOFT_SYNC;
+
oap->oap_cmd = cmd;
oap->oap_page_off = ops->ops_from;
oap->oap_count = ops->ops_to - ops->ops_from;
diff --git a/drivers/staging/lustre/lustre/osc/osc_internal.h 
b/drivers/staging/lustre/lustre/osc/osc_internal.h
index 39e6138..7fad827 100644
--- a/drivers/staging/lustre/lustre/osc/osc_internal.h
+++ b/drivers/staging/lustre/lustre/osc/osc_internal.h
@@ -202,6 +202,7 @@ int osc_quotacheck(struct obd_device *unused, struct 
obd_export *exp,
 int osc_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk);
 void osc_inc_unstable_pages(struct ptlrpc_request *req);
 void osc_dec_unstable_pages(struct ptlrpc_request *req);
+int  osc_over_unstable_soft_limit(struct client_obd *cli);
 
 struct ldlm_lock *osc_dlmlock_at_pgoff(const struct lu_env *env,
   struct osc_object *obj, pgoff_t index,
diff --git a/drivers/staging/lustre/lustre/osc/osc_page.c 
b/drivers/staging/lustre/lustre/osc/osc_page.c
index 5ec5508..c29c2ea 100644
--- a/drivers/staging/lustre/lustre/osc/osc_page.c
+++ b/drivers/staging/lustre/lustre/osc/osc_page.c
@@ -346,6 +346,32 @@ int osc_page_init(const struct lu_env *env, struct 
cl_object *obj,
return result;
 }
 
+int osc_over_unstable_soft_limit(struct client_obd *cli)
+{
+   long obd_upages, obd_dpages, osc_upages;
+
+   /* Can't check cli->cl_unstable_count, therefore, no soft limit */
+   if (!cli)
+   return 0;
+
+   obd_upages = atomic_read(_unstable_pages);
+   obd_dpages = atomic_read(_dirty_pages);
+
+   osc_upages = atomic_read(>cl_unstable_count);
+
+   /*
+* obd_max_dirty_pages is the max number of (dirty + unstable)
+* pages allowed at any given time. To simulate an unstable page
+* only limit, we subtract the current number of dirty pages
+* from this max. This difference is roughly the amount of pages
+* currently available for unstable pages. Thus, the soft limit
+* is half of that difference. Check osc_upages to ensure we don't
+* set SOFT_SYNC for OSCs without any outstanding unstable pages.
+*/
+   return osc_upages &&
+  obd_upages >= 

[PATCH 12/15] staging: lustre: osc: Track and limit "unstable" pages

2016-04-27 Thread James Simmons
From: Prakash Surya 

This change adds a global counter to track the number of "unstable"
pages held by a given client, along with per file system counters. An
"unstable" page is defined as a page which has been sent to the server
as part of a bulk request, but is uncommitted to stable storage.

In addition to simply tracking the unstable pages, they now also count
towards the maximum number of "pinned" pages on the system at any given
time. Thus, a client is now limited in the number of dirty and
unstable pages it can pin in memory. Previously only dirty pages were
accounted for in this limit.

In addition to tracking the number of unstable pages in Lustre, the
NR_UNSTABLE_NFS memory zone is also incremented and decremented for
easy monitoring using the "NFS_Unstable:" field in /proc/meminfo.
This field is also used internally by the kernel to limit the total
amount of unstable pages on the system.

The motivation for this change is twofold. First, the client must not
allow itself to disconnect from an OST while still holding unstable
pages. Otherwise, these unstable pages can get lost due to an OST
failure, and replay is not possible due to the disconnect via unmount.

Secondly, the client needs a mechanism to prevent it from allocating too
much of its available RAM to unreclaimable pages pinned by the ptlrpc
layer. If this case occurs, out of memory events can trigger as a side
effect, which we need to avoid.

The current number of unstable pages accounted for on a per file system
granularity is exported by the unstable_stats proc file, contained under
each file system's llite namespace. An example of retrieving this
information is below:

$ lctl get_param llite.*.unstable_stats

Signed-off-by: Prakash Surya 
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-2139
Reviewed-on: http://review.whamcloud.com/6284
Reviewed-by: Jinshan Xiong 
Reviewed-by: Andreas Dilger 
Reviewed-by: Oleg Drokin 
Signed-off-by: James Simmons 
---
 drivers/staging/lustre/lustre/include/cl_object.h  |   10 ++
 drivers/staging/lustre/lustre/include/lustre_net.h |4 +-
 drivers/staging/lustre/lustre/include/obd.h|2 +-
 .../staging/lustre/lustre/include/obd_support.h|1 +
 .../staging/lustre/lustre/llite/llite_internal.h   |6 +
 drivers/staging/lustre/lustre/llite/llite_lib.c|   20 -
 drivers/staging/lustre/lustre/llite/lproc_llite.c  |   18 
 drivers/staging/lustre/lustre/obdclass/class_obd.c |2 +
 drivers/staging/lustre/lustre/osc/osc_cache.c  |   99 +++-
 drivers/staging/lustre/lustre/osc/osc_internal.h   |3 +
 drivers/staging/lustre/lustre/osc/osc_request.c|   28 +-
 11 files changed, 182 insertions(+), 11 deletions(-)

diff --git a/drivers/staging/lustre/lustre/include/cl_object.h 
b/drivers/staging/lustre/lustre/include/cl_object.h
index 918be65..587a236 100644
--- a/drivers/staging/lustre/lustre/include/cl_object.h
+++ b/drivers/staging/lustre/lustre/include/cl_object.h
@@ -2351,6 +2351,16 @@ struct cl_client_cache {
 * Lock to protect ccc_lru list
 */
spinlock_t  ccc_lru_lock;
+   /**
+* # of unstable pages for this mount point
+*/
+   atomic_tccc_unstable_nr;
+   /**
+* Waitq for awaiting unstable pages to reach zero.
+* Used at umounting time and signaled on BRW commit
+*/
+wait_queue_head_t  ccc_unstable_waitq;
+
 };
 
 /** @} cl_page */
diff --git a/drivers/staging/lustre/lustre/include/lustre_net.h 
b/drivers/staging/lustre/lustre/include/lustre_net.h
index 69586a5..a7973d5 100644
--- a/drivers/staging/lustre/lustre/include/lustre_net.h
+++ b/drivers/staging/lustre/lustre/include/lustre_net.h
@@ -1327,7 +1327,9 @@ struct ptlrpc_request {
/* allow the req to be sent if the import is in recovery
 * status
 */
-   rq_allow_replay:1;
+   rq_allow_replay:1,
+   /* bulk request, sent to server, but uncommitted */
+   rq_unstable:1;
 
unsigned int rq_nr_resend;
 
diff --git a/drivers/staging/lustre/lustre/include/obd.h 
b/drivers/staging/lustre/lustre/include/obd.h
index e97e25b..3f24a5b 100644
--- a/drivers/staging/lustre/lustre/include/obd.h
+++ b/drivers/staging/lustre/lustre/include/obd.h
@@ -477,7 +477,7 @@ struct lov_obd {
struct dentry   *lov_pool_debugfs_entry;
enum lustre_sec_partlov_sp_me;
 
-   /* Cached LRU pages from upper layer */
+   /* Cached LRU and unstable data from upper layer */
void   *lov_cache;
 
struct rw_semaphore lov_notify_lock;
diff --git a/drivers/staging/lustre/lustre/include/obd_support.h 
b/drivers/staging/lustre/lustre/include/obd_support.h
index f8ee3a3..c7267b7 100644
--- 

[PATCH 12/15] staging: lustre: osc: Track and limit "unstable" pages

2016-04-27 Thread James Simmons
From: Prakash Surya 

This change adds a global counter to track the number of "unstable"
pages held by a given client, along with per file system counters. An
"unstable" page is defined as a page which has been sent to the server
as part of a bulk request, but is uncommitted to stable storage.

In addition to simply tracking the unstable pages, they now also count
towards the maximum number of "pinned" pages on the system at any given
time. Thus, a client is now limited in the number of dirty and
unstable pages it can pin in memory. Previously only dirty pages were
accounted for in this limit.

In addition to tracking the number of unstable pages in Lustre, the
NR_UNSTABLE_NFS memory zone is also incremented and decremented for
easy monitoring using the "NFS_Unstable:" field in /proc/meminfo.
This field is also used internally by the kernel to limit the total
amount of unstable pages on the system.

The motivation for this change is twofold. First, the client must not
allow itself to disconnect from an OST while still holding unstable
pages. Otherwise, these unstable pages can get lost due to an OST
failure, and replay is not possible due to the disconnect via unmount.

Secondly, the client needs a mechanism to prevent it from allocating too
much of its available RAM to unreclaimable pages pinned by the ptlrpc
layer. If this case occurs, out of memory events can trigger as a side
effect, which we need to avoid.

The current number of unstable pages accounted for on a per file system
granularity is exported by the unstable_stats proc file, contained under
each file system's llite namespace. An example of retrieving this
information is below:

$ lctl get_param llite.*.unstable_stats

Signed-off-by: Prakash Surya 
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-2139
Reviewed-on: http://review.whamcloud.com/6284
Reviewed-by: Jinshan Xiong 
Reviewed-by: Andreas Dilger 
Reviewed-by: Oleg Drokin 
Signed-off-by: James Simmons 
---
 drivers/staging/lustre/lustre/include/cl_object.h  |   10 ++
 drivers/staging/lustre/lustre/include/lustre_net.h |4 +-
 drivers/staging/lustre/lustre/include/obd.h|2 +-
 .../staging/lustre/lustre/include/obd_support.h|1 +
 .../staging/lustre/lustre/llite/llite_internal.h   |6 +
 drivers/staging/lustre/lustre/llite/llite_lib.c|   20 -
 drivers/staging/lustre/lustre/llite/lproc_llite.c  |   18 
 drivers/staging/lustre/lustre/obdclass/class_obd.c |2 +
 drivers/staging/lustre/lustre/osc/osc_cache.c  |   99 +++-
 drivers/staging/lustre/lustre/osc/osc_internal.h   |3 +
 drivers/staging/lustre/lustre/osc/osc_request.c|   28 +-
 11 files changed, 182 insertions(+), 11 deletions(-)

diff --git a/drivers/staging/lustre/lustre/include/cl_object.h 
b/drivers/staging/lustre/lustre/include/cl_object.h
index 918be65..587a236 100644
--- a/drivers/staging/lustre/lustre/include/cl_object.h
+++ b/drivers/staging/lustre/lustre/include/cl_object.h
@@ -2351,6 +2351,16 @@ struct cl_client_cache {
 * Lock to protect ccc_lru list
 */
spinlock_t  ccc_lru_lock;
+   /**
+* # of unstable pages for this mount point
+*/
+   atomic_tccc_unstable_nr;
+   /**
+* Waitq for awaiting unstable pages to reach zero.
+* Used at umounting time and signaled on BRW commit
+*/
+wait_queue_head_t  ccc_unstable_waitq;
+
 };
 
 /** @} cl_page */
diff --git a/drivers/staging/lustre/lustre/include/lustre_net.h 
b/drivers/staging/lustre/lustre/include/lustre_net.h
index 69586a5..a7973d5 100644
--- a/drivers/staging/lustre/lustre/include/lustre_net.h
+++ b/drivers/staging/lustre/lustre/include/lustre_net.h
@@ -1327,7 +1327,9 @@ struct ptlrpc_request {
/* allow the req to be sent if the import is in recovery
 * status
 */
-   rq_allow_replay:1;
+   rq_allow_replay:1,
+   /* bulk request, sent to server, but uncommitted */
+   rq_unstable:1;
 
unsigned int rq_nr_resend;
 
diff --git a/drivers/staging/lustre/lustre/include/obd.h 
b/drivers/staging/lustre/lustre/include/obd.h
index e97e25b..3f24a5b 100644
--- a/drivers/staging/lustre/lustre/include/obd.h
+++ b/drivers/staging/lustre/lustre/include/obd.h
@@ -477,7 +477,7 @@ struct lov_obd {
struct dentry   *lov_pool_debugfs_entry;
enum lustre_sec_partlov_sp_me;
 
-   /* Cached LRU pages from upper layer */
+   /* Cached LRU and unstable data from upper layer */
void   *lov_cache;
 
struct rw_semaphore lov_notify_lock;
diff --git a/drivers/staging/lustre/lustre/include/obd_support.h 
b/drivers/staging/lustre/lustre/include/obd_support.h
index f8ee3a3..c7267b7 100644
--- a/drivers/staging/lustre/lustre/include/obd_support.h
+++ b/drivers/staging/lustre/lustre/include/obd_support.h
@@ -58,6 +58,7 @@ extern int 

[PATCH 08/15] staging: lustre: ldlm: update comments about ldlm l_flags

2016-04-27 Thread James Simmons
From: Bruce Korb 

Add and update documentation about some of the ldlm l_flags.

Signed-off-by: Bruce Korb 
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-2906
Reviewed-by: Keith Mannthey 
Reviewed-on: http://review.whamcloud.com/7963
Reviewed-by: Doug Oucharek 
Reviewed-by: Andreas Dilger 
Signed-off-by: James Simmons 
---
 .../lustre/lustre/include/lustre_dlm_flags.h   |   14 +++---
 1 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h 
b/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h
index 1eb0cc4..e7e0c21 100644
--- a/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h
+++ b/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h
@@ -198,7 +198,7 @@
 #define ldlm_set_cancel(_l) LDLM_SET_FLAG((_l), 1ULL << 36)
 #define ldlm_clear_cancel(_l)   LDLM_CLEAR_FLAG((_l), 1ULL << 36)
 
-/** whatever it might mean */
+/** whatever it might mean -- never transmitted? */
 #define LDLM_FL_LOCAL_ONLY  0x0000002000000000ULL /* bit 37 */
 #define ldlm_is_local_only(_l)  LDLM_TEST_FLAG((_l), 1ULL << 37)
 #define ldlm_set_local_only(_l) LDLM_SET_FLAG((_l), 1ULL << 37)
@@ -281,18 +281,18 @@
  * has canceled this lock and is waiting for rpc_lock which is taken by
  * the first operation. LDLM_FL_BL_AST is set by ldlm_callback_handler() in
  * the lock to prevent the Early Lock Cancel (ELC) code from cancelling it.
- *
- * LDLM_FL_BL_DONE is to be set by ldlm_cancel_callback() when lock cache is
- * dropped to let ldlm_callback_handler() return EINVAL to the server. It
- * is used when ELC RPC is already prepared and is waiting for rpc_lock,
- * too late to send a separate CANCEL RPC.
  */
 #define LDLM_FL_BL_AST  0x0000400000000000ULL /* bit 46 */
 #define ldlm_is_bl_ast(_l)  LDLM_TEST_FLAG((_l), 1ULL << 46)
 #define ldlm_set_bl_ast(_l) LDLM_SET_FLAG((_l), 1ULL << 46)
 #define ldlm_clear_bl_ast(_l)   LDLM_CLEAR_FLAG((_l), 1ULL << 46)
 
-/** whatever it might mean */
+/**
+ * Set by ldlm_cancel_callback() when lock cache is dropped to let
+ * ldlm_callback_handler() return EINVAL to the server. It is used when
+ * ELC RPC is already prepared and is waiting for rpc_lock, too late to
+ * send a separate CANCEL RPC.
+ */
 #define LDLM_FL_BL_DONE 0x0000800000000000ULL /* bit 47 */
 #define ldlm_is_bl_done(_l) LDLM_TEST_FLAG((_l), 1ULL << 47)
 #define ldlm_set_bl_done(_l)LDLM_SET_FLAG((_l), 1ULL << 47)
-- 
1.7.1



[PATCH 08/15] staging: lustre: ldlm: update comments about ldlm l_flags

2016-04-27 Thread James Simmons
From: Bruce Korb 

Add and update documentation about some of the ldlm l_flags.

Signed-off-by: Bruce Korb 
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-2906
Reviewed-by: Keith Mannthey 
Reviewed-on: http://review.whamcloud.com/7963
Reviewed-by: Doug Oucharek 
Reviewed-by: Andreas Dilger 
Signed-off-by: James Simmons 
---
 .../lustre/lustre/include/lustre_dlm_flags.h   |   14 +++---
 1 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h 
b/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h
index 1eb0cc4..e7e0c21 100644
--- a/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h
+++ b/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h
@@ -198,7 +198,7 @@
 #define ldlm_set_cancel(_l) LDLM_SET_FLAG((_l), 1ULL << 36)
 #define ldlm_clear_cancel(_l)   LDLM_CLEAR_FLAG((_l), 1ULL << 36)
 
-/** whatever it might mean */
+/** whatever it might mean -- never transmitted? */
 #define LDLM_FL_LOCAL_ONLY  0x0000002000000000ULL /* bit 37 */
 #define ldlm_is_local_only(_l)  LDLM_TEST_FLAG((_l), 1ULL << 37)
 #define ldlm_set_local_only(_l) LDLM_SET_FLAG((_l), 1ULL << 37)
@@ -281,18 +281,18 @@
  * has canceled this lock and is waiting for rpc_lock which is taken by
  * the first operation. LDLM_FL_BL_AST is set by ldlm_callback_handler() in
  * the lock to prevent the Early Lock Cancel (ELC) code from cancelling it.
- *
- * LDLM_FL_BL_DONE is to be set by ldlm_cancel_callback() when lock cache is
- * dropped to let ldlm_callback_handler() return EINVAL to the server. It
- * is used when ELC RPC is already prepared and is waiting for rpc_lock,
- * too late to send a separate CANCEL RPC.
  */
 #define LDLM_FL_BL_AST  0x0000400000000000ULL /* bit 46 */
 #define ldlm_is_bl_ast(_l)  LDLM_TEST_FLAG((_l), 1ULL << 46)
 #define ldlm_set_bl_ast(_l) LDLM_SET_FLAG((_l), 1ULL << 46)
 #define ldlm_clear_bl_ast(_l)   LDLM_CLEAR_FLAG((_l), 1ULL << 46)
 
-/** whatever it might mean */
+/**
+ * Set by ldlm_cancel_callback() when lock cache is dropped to let
+ * ldlm_callback_handler() return EINVAL to the server. It is used when
+ * ELC RPC is already prepared and is waiting for rpc_lock, too late to
+ * send a separate CANCEL RPC.
+ */
 #define LDLM_FL_BL_DONE 0x0000800000000000ULL /* bit 47 */
 #define ldlm_is_bl_done(_l) LDLM_TEST_FLAG((_l), 1ULL << 47)
 #define ldlm_set_bl_done(_l)LDLM_SET_FLAG((_l), 1ULL << 47)
-- 
1.7.1



[PATCH 02/15] staging: lustre: llite: NFS reexport issue

2016-04-27 Thread James Simmons
From: Dmitry Eremin 

Suppress erroneous/confusing messages when NFS
is out of sync and requests old data.

Signed-off-by: Dmitry Eremin 
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-4050
Reviewed-on: http://review.whamcloud.com/7850
Reviewed-by: Andreas Dilger 
Reviewed-by: Bob Glossman 
Signed-off-by: James Simmons 
---
 drivers/staging/lustre/lustre/llite/llite_nfs.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/staging/lustre/lustre/llite/llite_nfs.c 
b/drivers/staging/lustre/lustre/llite/llite_nfs.c
index 193aab8..2c26815 100644
--- a/drivers/staging/lustre/lustre/llite/llite_nfs.c
+++ b/drivers/staging/lustre/lustre/llite/llite_nfs.c
@@ -119,7 +119,7 @@ struct inode *search_inode_for_lustre(struct super_block 
*sb,
rc = md_getattr(sbi->ll_md_exp, op_data, &req);
kfree(op_data);
if (rc) {
-   CERROR("can't get object attrs, fid "DFID", rc %d\n",
+   CDEBUG(D_INFO, "can't get object attrs, fid "DFID", rc %d\n",
   PFID(fid), rc);
return ERR_PTR(rc);
}
-- 
1.7.1



[PATCH 02/15] staging: lustre: llite: NFS reexport issue

2016-04-27 Thread James Simmons
From: Dmitry Eremin 

Suppress erroneous/confusing messages when NFS
is out of sync and requests old data.

Signed-off-by: Dmitry Eremin 
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-4050
Reviewed-on: http://review.whamcloud.com/7850
Reviewed-by: Andreas Dilger 
Reviewed-by: Bob Glossman 
Signed-off-by: James Simmons 
---
 drivers/staging/lustre/lustre/llite/llite_nfs.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/staging/lustre/lustre/llite/llite_nfs.c 
b/drivers/staging/lustre/lustre/llite/llite_nfs.c
index 193aab8..2c26815 100644
--- a/drivers/staging/lustre/lustre/llite/llite_nfs.c
+++ b/drivers/staging/lustre/lustre/llite/llite_nfs.c
@@ -119,7 +119,7 @@ struct inode *search_inode_for_lustre(struct super_block 
*sb,
rc = md_getattr(sbi->ll_md_exp, op_data, &req);
kfree(op_data);
if (rc) {
-   CERROR("can't get object attrs, fid "DFID", rc %d\n",
+   CDEBUG(D_INFO, "can't get object attrs, fid "DFID", rc %d\n",
   PFID(fid), rc);
return ERR_PTR(rc);
}
-- 
1.7.1



[PATCH 06/15] staging: lustre: ldlm: clean up l_flags

2016-04-27 Thread James Simmons
From: Bruce Korb 

Remove the now obsolete LDLM_AST_FLAGS and LDLM_INHERIT_FLAGS defines.
Remove the obsolete LDLM_FL_HIDE_LOCK_MASK define.
Rename "local_only" mask to "off_wire" since it is confusingly similar
to a flag that (I think) means, "do not copy this lock over the wire."

Signed-off-by: Bruce Korb 
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-2906
Reviewed-by: Keith Mannthey 
Reviewed-on: http://review.whamcloud.com/7963
Reviewed-by: Doug Oucharek 
Reviewed-by: Andreas Dilger 
Signed-off-by: James Simmons 
---
 .../lustre/lustre/include/lustre_dlm_flags.h   |   16 ++--
 drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c|2 +-
 drivers/staging/lustre/lustre/ldlm/ldlm_request.c  |2 +-
 3 files changed, 4 insertions(+), 16 deletions(-)

diff --git a/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h 
b/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h
index aff0904..62d3b31 100644
--- a/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h
+++ b/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h
@@ -37,17 +37,11 @@
 /** l_flags bits marked as "gone" bits */
 #define LDLM_FL_GONE_MASK   0x00060040ULL
 
-/** l_flags bits marked as "hide_lock" bits */
-#define LDLM_FL_HIDE_LOCK_MASK  0x2064ULL
-
 /** l_flags bits marked as "inherit" bits */
 #define LDLM_FL_INHERIT_MASK0x0080ULL
 
-/** l_flags bits marked as "local_only" bits */
-#define LDLM_FL_LOCAL_ONLY_MASK 0x00FFULL
-
-/** l_flags bits marked as "on_wire" bits */
-#define LDLM_FL_ON_WIRE_MASK0xC08F932FULL
+/** l_flags bits marked as "off_wire" bits */
+#define LDLM_FL_OFF_WIRE_MASK   0x00FFULL
 
 /** extent, mode, or resource changed */
 #define LDLM_FL_LOCK_CHANGED0x0001ULL /* bit 0 */
@@ -390,12 +384,6 @@
 /** clear a ldlm_lock flag bit */
 #define LDLM_CLEAR_FLAG(_l, _b)   ((_l)->l_flags &= ~(_b))
 
-/** Mask of flags inherited from parent lock when doing intents. */
-#define LDLM_INHERIT_FLAGSLDLM_FL_INHERIT_MASK
-
-/** Mask of Flags sent in AST lock_flags to map into the receiving lock. */
-#define LDLM_AST_FLAGSLDLM_FL_AST_MASK
-
 /** @} subgroup */
 /** @} group */
 #ifdef WIRESHARK_COMPILE
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c 
b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c
index 024185b..ab739f0 100644
--- a/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c
@@ -632,7 +632,7 @@ static int ldlm_callback_handler(struct ptlrpc_request *req)
/* Copy hints/flags (e.g. LDLM_FL_DISCARD_DATA) from AST. */
lock_res_and_lock(lock);
lock->l_flags |= ldlm_flags_from_wire(dlm_req->lock_flags &
- LDLM_AST_FLAGS);
+ LDLM_FL_AST_MASK);
if (lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) {
/* If somebody cancels lock and cache is already dropped,
 * or lock is failed before cp_ast received on client,
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_request.c 
b/drivers/staging/lustre/lustre/ldlm/ldlm_request.c
index 0e4ab2c..107314e 100644
--- a/drivers/staging/lustre/lustre/ldlm/ldlm_request.c
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_request.c
@@ -421,7 +421,7 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct 
ptlrpc_request *req,
 
*flags = ldlm_flags_from_wire(reply->lock_flags);
lock->l_flags |= ldlm_flags_from_wire(reply->lock_flags &
- LDLM_INHERIT_FLAGS);
+ LDLM_FL_INHERIT_MASK);
/* move NO_TIMEOUT flag to the lock to force ldlm_lock_match()
 * to wait with no timeout as well
 */
-- 
1.7.1



[PATCH 00/15] patches missing from lustre 2.5.51

2016-04-27 Thread James Simmons
This is a collection of bug fixes and code cleanups that exist
in Lustre version 2.5.51 but are missing from the upstream
client.

Andreas Dilger (1):
  staging: lustre: ptlrpc: quiet warning for 2.1/2.5 connections

Bruce Korb (4):
  staging: lustre: ldlm: use accessor macros for l_flags
  staging: lustre: ldlm: clean up l_flags
  staging: lustre: ldlm: remove code wireshark handling
  staging: lustre: ldlm: update comments about ldlm l_flags

Dmitry Eremin (2):
  staging: lustre: llite: NFS reexport issue
  staging: lustre: lmv: kernel crash due to misconfigured MDT

James Nunez (1):
  staging: lustre: llite: Replace printing of i_ino with ll_inode2fid()

Jinshan Xiong (1):
  staging: lustre: llite: reset writeback index in ll_writepages

Niu Yawei (1):
  staging: lustre: clio: add debug message in osc_completion()

Prakash Surya (3):
  staging: lustre: osc: Track and limit "unstable" pages
  staging: lustre: osc: Track number of "unstable" pages per osc
  staging: lustre: osc: Use SOFT_SYNC to urge server commit

Sebastien Buisson (1):
  staging: lustre: mgc: fix 'error handling' issues

Wang Di (1):
  staging: lustre: obdclass: add LCT_SERVER_SESSION for server session

 drivers/staging/lustre/lustre/include/cl_object.h  |   10 ++
 drivers/staging/lustre/lustre/include/lu_object.h  |4 +
 .../lustre/lustre/include/lustre/lustre_idl.h  |5 +
 .../lustre/lustre/include/lustre_dlm_flags.h   |  120 ++--
 drivers/staging/lustre/lustre/include/lustre_net.h |4 +-
 drivers/staging/lustre/lustre/include/obd.h|3 +-
 drivers/staging/lustre/lustre/include/obd_class.h  |2 +-
 .../staging/lustre/lustre/include/obd_support.h|1 +
 drivers/staging/lustre/lustre/ldlm/l_lock.c|4 +-
 drivers/staging/lustre/lustre/ldlm/ldlm_extent.c   |4 +-
 drivers/staging/lustre/lustre/ldlm/ldlm_flock.c|   11 +-
 drivers/staging/lustre/lustre/ldlm/ldlm_internal.h |7 +-
 drivers/staging/lustre/lustre/ldlm/ldlm_lib.c  |1 +
 drivers/staging/lustre/lustre/ldlm/ldlm_lock.c |   96 ++---
 drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c|   28 ++--
 drivers/staging/lustre/lustre/ldlm/ldlm_request.c  |   34 ++---
 drivers/staging/lustre/lustre/ldlm/ldlm_resource.c |   12 +-
 drivers/staging/lustre/lustre/llite/dcache.c   |   15 +--
 drivers/staging/lustre/lustre/llite/dir.c  |   23 ++--
 drivers/staging/lustre/lustre/llite/file.c |   76 +-
 drivers/staging/lustre/lustre/llite/llite_close.c  |   40 +++---
 .../staging/lustre/lustre/llite/llite_internal.h   |   16 ++-
 drivers/staging/lustre/lustre/llite/llite_lib.c|   60 +---
 drivers/staging/lustre/lustre/llite/llite_mmap.c   |6 +-
 drivers/staging/lustre/lustre/llite/llite_nfs.c|   16 ++-
 drivers/staging/lustre/lustre/llite/lproc_llite.c  |   18 +++
 drivers/staging/lustre/lustre/llite/namei.c|   78 +--
 drivers/staging/lustre/lustre/llite/rw.c   |5 +-
 drivers/staging/lustre/lustre/llite/rw26.c |5 +-
 drivers/staging/lustre/lustre/llite/statahead.c|   17 +--
 drivers/staging/lustre/lustre/llite/symlink.c  |   10 +-
 drivers/staging/lustre/lustre/llite/vvp_dev.c  |5 +-
 drivers/staging/lustre/lustre/llite/xattr.c|   20 ++--
 drivers/staging/lustre/lustre/lmv/lmv_obd.c|  151 
 drivers/staging/lustre/lustre/mgc/mgc_request.c|6 +-
 drivers/staging/lustre/lustre/obdclass/class_obd.c |2 +
 drivers/staging/lustre/lustre/osc/lproc_osc.c  |   18 +++
 drivers/staging/lustre/lustre/osc/osc_cache.c  |  142 --
 drivers/staging/lustre/lustre/osc/osc_internal.h   |4 +
 drivers/staging/lustre/lustre/osc/osc_lock.c   |2 +-
 drivers/staging/lustre/lustre/osc/osc_page.c   |   29 
 drivers/staging/lustre/lustre/osc/osc_request.c|   31 -
 drivers/staging/lustre/lustre/ptlrpc/import.c  |   11 +-
 drivers/staging/lustre/lustre/ptlrpc/service.c |3 +-
 drivers/staging/lustre/lustre/ptlrpc/wiretest.c|6 +
 45 files changed, 682 insertions(+), 479 deletions(-)



[PATCH 06/15] staging: lustre: ldlm: clean up l_flags

2016-04-27 Thread James Simmons
From: Bruce Korb 

Remove the now obsolete LDLM_AST_FLAGS and LDLM_INHERIT_FLAGS defines.
Remove the obsolete LDLM_FL_HIDE_LOCK_MASK define.
Rename "local_only" mask to "off_wire" since it is confusingly similar
to a flag that (I think) means, "do not copy this lock over the wire."

Signed-off-by: Bruce Korb 
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-2906
Reviewed-by: Keith Mannthey 
Reviewed-on: http://review.whamcloud.com/7963
Reviewed-by: Doug Oucharek 
Reviewed-by: Andreas Dilger 
Signed-off-by: James Simmons 
---
 .../lustre/lustre/include/lustre_dlm_flags.h   |   16 ++--
 drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c|2 +-
 drivers/staging/lustre/lustre/ldlm/ldlm_request.c  |2 +-
 3 files changed, 4 insertions(+), 16 deletions(-)

diff --git a/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h 
b/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h
index aff0904..62d3b31 100644
--- a/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h
+++ b/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h
@@ -37,17 +37,11 @@
 /** l_flags bits marked as "gone" bits */
 #define LDLM_FL_GONE_MASK   0x00060040ULL
 
-/** l_flags bits marked as "hide_lock" bits */
-#define LDLM_FL_HIDE_LOCK_MASK  0x2064ULL
-
 /** l_flags bits marked as "inherit" bits */
 #define LDLM_FL_INHERIT_MASK0x0080ULL
 
-/** l_flags bits marked as "local_only" bits */
-#define LDLM_FL_LOCAL_ONLY_MASK 0x00FFULL
-
-/** l_flags bits marked as "on_wire" bits */
-#define LDLM_FL_ON_WIRE_MASK0xC08F932FULL
+/** l_flags bits marked as "off_wire" bits */
+#define LDLM_FL_OFF_WIRE_MASK   0x00FFULL
 
 /** extent, mode, or resource changed */
 #define LDLM_FL_LOCK_CHANGED0x0001ULL /* bit 0 */
@@ -390,12 +384,6 @@
 /** clear a ldlm_lock flag bit */
 #define LDLM_CLEAR_FLAG(_l, _b)   ((_l)->l_flags &= ~(_b))
 
-/** Mask of flags inherited from parent lock when doing intents. */
-#define LDLM_INHERIT_FLAGSLDLM_FL_INHERIT_MASK
-
-/** Mask of Flags sent in AST lock_flags to map into the receiving lock. */
-#define LDLM_AST_FLAGSLDLM_FL_AST_MASK
-
 /** @} subgroup */
 /** @} group */
 #ifdef WIRESHARK_COMPILE
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c 
b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c
index 024185b..ab739f0 100644
--- a/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c
@@ -632,7 +632,7 @@ static int ldlm_callback_handler(struct ptlrpc_request *req)
/* Copy hints/flags (e.g. LDLM_FL_DISCARD_DATA) from AST. */
lock_res_and_lock(lock);
lock->l_flags |= ldlm_flags_from_wire(dlm_req->lock_flags &
- LDLM_AST_FLAGS);
+ LDLM_FL_AST_MASK);
if (lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) {
/* If somebody cancels lock and cache is already dropped,
 * or lock is failed before cp_ast received on client,
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_request.c 
b/drivers/staging/lustre/lustre/ldlm/ldlm_request.c
index 0e4ab2c..107314e 100644
--- a/drivers/staging/lustre/lustre/ldlm/ldlm_request.c
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_request.c
@@ -421,7 +421,7 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct 
ptlrpc_request *req,
 
*flags = ldlm_flags_from_wire(reply->lock_flags);
lock->l_flags |= ldlm_flags_from_wire(reply->lock_flags &
- LDLM_INHERIT_FLAGS);
+ LDLM_FL_INHERIT_MASK);
/* move NO_TIMEOUT flag to the lock to force ldlm_lock_match()
 * to wait with no timeout as well
 */
-- 
1.7.1



[PATCH 00/15] patches missing from lustre 2.5.51

2016-04-27 Thread James Simmons
This is a collection of bug fixes and code cleanups that exist
in Lustre version 2.5.51 but are missing from the upstream
client.

Andreas Dilger (1):
  staging: lustre: ptlrpc: quiet warning for 2.1/2.5 connections

Bruce Korb (4):
  staging: lustre: ldlm: use accessor macros for l_flags
  staging: lustre: ldlm: clean up l_flags
  staging: lustre: ldlm: remove code wireshark handling
  staging: lustre: ldlm: update comments about ldlm l_flags

Dmitry Eremin (2):
  staging: lustre: llite: NFS reexport issue
  staging: lustre: lmv: kernel crash due to misconfigured MDT

James Nunez (1):
  staging: lustre: llite: Replace printing of i_ino with ll_inode2fid()

Jinshan Xiong (1):
  staging: lustre: llite: reset writeback index in ll_writepages

Niu Yawei (1):
  staging: lustre: clio: add debug message in osc_completion()

Prakash Surya (3):
  staging: lustre: osc: Track and limit "unstable" pages
  staging: lustre: osc: Track number of "unstable" pages per osc
  staging: lustre: osc: Use SOFT_SYNC to urge server commit

Sebastien Buisson (1):
  staging: lustre: mgc: fix 'error handling' issues

Wang Di (1):
  staging: lustre: obdclass: add LCT_SERVER_SESSION for server session

 drivers/staging/lustre/lustre/include/cl_object.h  |   10 ++
 drivers/staging/lustre/lustre/include/lu_object.h  |4 +
 .../lustre/lustre/include/lustre/lustre_idl.h  |5 +
 .../lustre/lustre/include/lustre_dlm_flags.h   |  120 ++--
 drivers/staging/lustre/lustre/include/lustre_net.h |4 +-
 drivers/staging/lustre/lustre/include/obd.h|3 +-
 drivers/staging/lustre/lustre/include/obd_class.h  |2 +-
 .../staging/lustre/lustre/include/obd_support.h|1 +
 drivers/staging/lustre/lustre/ldlm/l_lock.c|4 +-
 drivers/staging/lustre/lustre/ldlm/ldlm_extent.c   |4 +-
 drivers/staging/lustre/lustre/ldlm/ldlm_flock.c|   11 +-
 drivers/staging/lustre/lustre/ldlm/ldlm_internal.h |7 +-
 drivers/staging/lustre/lustre/ldlm/ldlm_lib.c  |1 +
 drivers/staging/lustre/lustre/ldlm/ldlm_lock.c |   96 ++---
 drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c|   28 ++--
 drivers/staging/lustre/lustre/ldlm/ldlm_request.c  |   34 ++---
 drivers/staging/lustre/lustre/ldlm/ldlm_resource.c |   12 +-
 drivers/staging/lustre/lustre/llite/dcache.c   |   15 +--
 drivers/staging/lustre/lustre/llite/dir.c  |   23 ++--
 drivers/staging/lustre/lustre/llite/file.c |   76 +-
 drivers/staging/lustre/lustre/llite/llite_close.c  |   40 +++---
 .../staging/lustre/lustre/llite/llite_internal.h   |   16 ++-
 drivers/staging/lustre/lustre/llite/llite_lib.c|   60 +---
 drivers/staging/lustre/lustre/llite/llite_mmap.c   |6 +-
 drivers/staging/lustre/lustre/llite/llite_nfs.c|   16 ++-
 drivers/staging/lustre/lustre/llite/lproc_llite.c  |   18 +++
 drivers/staging/lustre/lustre/llite/namei.c|   78 +--
 drivers/staging/lustre/lustre/llite/rw.c   |5 +-
 drivers/staging/lustre/lustre/llite/rw26.c |5 +-
 drivers/staging/lustre/lustre/llite/statahead.c|   17 +--
 drivers/staging/lustre/lustre/llite/symlink.c  |   10 +-
 drivers/staging/lustre/lustre/llite/vvp_dev.c  |5 +-
 drivers/staging/lustre/lustre/llite/xattr.c|   20 ++--
 drivers/staging/lustre/lustre/lmv/lmv_obd.c|  151 
 drivers/staging/lustre/lustre/mgc/mgc_request.c|6 +-
 drivers/staging/lustre/lustre/obdclass/class_obd.c |2 +
 drivers/staging/lustre/lustre/osc/lproc_osc.c  |   18 +++
 drivers/staging/lustre/lustre/osc/osc_cache.c  |  142 --
 drivers/staging/lustre/lustre/osc/osc_internal.h   |4 +
 drivers/staging/lustre/lustre/osc/osc_lock.c   |2 +-
 drivers/staging/lustre/lustre/osc/osc_page.c   |   29 
 drivers/staging/lustre/lustre/osc/osc_request.c|   31 -
 drivers/staging/lustre/lustre/ptlrpc/import.c  |   11 +-
 drivers/staging/lustre/lustre/ptlrpc/service.c |3 +-
 drivers/staging/lustre/lustre/ptlrpc/wiretest.c|6 +
 45 files changed, 682 insertions(+), 479 deletions(-)



Re: [PATCH, RFT] byteswap: try to avoid __builtin_constant_p gcc bug

2016-04-27 Thread Josh Poimboeuf
On Thu, Apr 28, 2016 at 12:00:36AM +0200, Arnd Bergmann wrote:
> This is another attempt to avoid a regression in wwn_to_u64()
> after that started using get_unaligned_be64(), which in turn
> ran into a bug on gcc-4.9 through 6.1.
> 
> As part of the problem is how __builtin_constant_p gets evaluated
> on an argument passed by reference into an inline function, this
> avoids the use of __builtin_constant_p() for all architectures
> that set CONFIG_ARCH_USE_BUILTIN_BSWAP. Most architectures do not
> set ARCH_SUPPORTS_OPTIMIZED_INLINING, which means they probably
> do not suffer from the problem in the qla2xxx driver, but they
> might still run into it elsewhere.
> 
> I have not been able to reproduce the original problem, so I don't
> know if this patch solves it, but at least it leads to simpler
> code doing the same thing, so at least there should be no downsides.
> 
> Please test.
> 
> Signed-off-by: Arnd Bergmann 

Nice patch.  I can confirm it fixes the issue with gcc 5.3.1.

Tested-by: Josh Poimboeuf 
Reviewed-by: Josh Poimboeuf 

> diff --git a/include/uapi/linux/swab.h b/include/uapi/linux/swab.h
> index 3f10e5317b46..de56fd54428d 100644
> --- a/include/uapi/linux/swab.h
> +++ b/include/uapi/linux/swab.h
> @@ -45,9 +45,7 @@
>  
>  static inline __attribute_const__ __u16 __fswab16(__u16 val)
>  {
> -#ifdef __HAVE_BUILTIN_BSWAP16__
> - return __builtin_bswap16(val);
> -#elif defined (__arch_swab16)
> +#if defined (__arch_swab16)
>   return __arch_swab16(val);
>  #else
>   return ___constant_swab16(val);
> @@ -56,9 +54,7 @@ static inline __attribute_const__ __u16 __fswab16(__u16 val)
>  
>  static inline __attribute_const__ __u32 __fswab32(__u32 val)
>  {
> -#ifdef __HAVE_BUILTIN_BSWAP32__
> - return __builtin_bswap32(val);
> -#elif defined(__arch_swab32)
> +#if defined(__arch_swab32)
>   return __arch_swab32(val);
>  #else
>   return ___constant_swab32(val);
> @@ -67,9 +63,7 @@ static inline __attribute_const__ __u32 __fswab32(__u32 val)
>  
>  static inline __attribute_const__ __u64 __fswab64(__u64 val)
>  {
> -#ifdef __HAVE_BUILTIN_BSWAP64__
> - return __builtin_bswap64(val);
> -#elif defined (__arch_swab64)
> +#if defined (__arch_swab64)
>   return __arch_swab64(val);
>  #elif defined(__SWAB_64_THRU_32__)
>   __u32 h = val >> 32;
> @@ -102,28 +96,40 @@ static inline __attribute_const__ __u32 __fswahb32(__u32 
> val)
>   * __swab16 - return a byteswapped 16-bit value
>   * @x: value to byteswap
>   */
> +#ifdef __HAVE_BUILTIN_BSWAP16__
> +#define __swab16(x) __builtin_bswap16((__u16)(x))
> +#else
>  #define __swab16(x)  \
>   (__builtin_constant_p((__u16)(x)) ? \
>   ___constant_swab16(x) : \
>   __fswab16(x))
> +#endif
>  
>  /**
>   * __swab32 - return a byteswapped 32-bit value
>   * @x: value to byteswap
>   */
> +#ifdef __HAVE_BUILTIN_BSWAP32__
> +#define __swab32(x) __builtin_bswap32((__u32)(x))
> +#else
>  #define __swab32(x)  \
>   (__builtin_constant_p((__u32)(x)) ? \
>   ___constant_swab32(x) : \
>   __fswab32(x))
> +#endif
>  
>  /**
>   * __swab64 - return a byteswapped 64-bit value
>   * @x: value to byteswap
>   */
> +#ifdef __HAVE_BUILTIN_BSWAP64__
> +#define __swab64(x) __builtin_bswap64((__u64)(x))
> +#else
>  #define __swab64(x)  \
>   (__builtin_constant_p((__u64)(x)) ? \
>   ___constant_swab64(x) : \
>   __fswab64(x))
> +#endif
>  
>  /**
>   * __swahw32 - return a word-swapped 32-bit value
> 

-- 
Josh


Re: [PATCH, RFT] byteswap: try to avoid __builtin_constant_p gcc bug

2016-04-27 Thread Josh Poimboeuf
On Thu, Apr 28, 2016 at 12:00:36AM +0200, Arnd Bergmann wrote:
> This is another attempt to avoid a regression in wwn_to_u64()
> after that started using get_unaligned_be64(), which in turn
> ran into a bug on gcc-4.9 through 6.1.
> 
> As part of the problem is how __builtin_constant_p gets evaluated
> on an argument passed by reference into an inline function, this
> avoids the use of __builtin_constant_p() for all architectures
> that set CONFIG_ARCH_USE_BUILTIN_BSWAP. Most architectures do not
> set ARCH_SUPPORTS_OPTIMIZED_INLINING, which means they probably
> do not suffer from the problem in the qla2xxx driver, but they
> might still run into it elsewhere.
> 
> I have not been able to reproduce the original problem, so I don't
> know if this patch solves it, but at least it leads to simpler
> code doing the same thing, so at least there should be no downsides.
> 
> Please test.
> 
> Signed-off-by: Arnd Bergmann 

Nice patch.  I can confirm it fixes the issue with gcc 5.3.1.

Tested-by: Josh Poimboeuf 
Reviewed-by: Josh Poimboeuf 

> diff --git a/include/uapi/linux/swab.h b/include/uapi/linux/swab.h
> index 3f10e5317b46..de56fd54428d 100644
> --- a/include/uapi/linux/swab.h
> +++ b/include/uapi/linux/swab.h
> @@ -45,9 +45,7 @@
>  
>  static inline __attribute_const__ __u16 __fswab16(__u16 val)
>  {
> -#ifdef __HAVE_BUILTIN_BSWAP16__
> - return __builtin_bswap16(val);
> -#elif defined (__arch_swab16)
> +#if defined (__arch_swab16)
>   return __arch_swab16(val);
>  #else
>   return ___constant_swab16(val);
> @@ -56,9 +54,7 @@ static inline __attribute_const__ __u16 __fswab16(__u16 val)
>  
>  static inline __attribute_const__ __u32 __fswab32(__u32 val)
>  {
> -#ifdef __HAVE_BUILTIN_BSWAP32__
> - return __builtin_bswap32(val);
> -#elif defined(__arch_swab32)
> +#if defined(__arch_swab32)
>   return __arch_swab32(val);
>  #else
>   return ___constant_swab32(val);
> @@ -67,9 +63,7 @@ static inline __attribute_const__ __u32 __fswab32(__u32 val)
>  
>  static inline __attribute_const__ __u64 __fswab64(__u64 val)
>  {
> -#ifdef __HAVE_BUILTIN_BSWAP64__
> - return __builtin_bswap64(val);
> -#elif defined (__arch_swab64)
> +#if defined (__arch_swab64)
>   return __arch_swab64(val);
>  #elif defined(__SWAB_64_THRU_32__)
>   __u32 h = val >> 32;
> @@ -102,28 +96,40 @@ static inline __attribute_const__ __u32 __fswahb32(__u32 
> val)
>   * __swab16 - return a byteswapped 16-bit value
>   * @x: value to byteswap
>   */
> +#ifdef __HAVE_BUILTIN_BSWAP16__
> +#define __swab16(x) __builtin_bswap16((__u16)(x))
> +#else
>  #define __swab16(x)  \
>   (__builtin_constant_p((__u16)(x)) ? \
>   ___constant_swab16(x) : \
>   __fswab16(x))
> +#endif
>  
>  /**
>   * __swab32 - return a byteswapped 32-bit value
>   * @x: value to byteswap
>   */
> +#ifdef __HAVE_BUILTIN_BSWAP32__
> +#define __swab32(x) __builtin_bswap32((__u32)(x))
> +#else
>  #define __swab32(x)  \
>   (__builtin_constant_p((__u32)(x)) ? \
>   ___constant_swab32(x) : \
>   __fswab32(x))
> +#endif
>  
>  /**
>   * __swab64 - return a byteswapped 64-bit value
>   * @x: value to byteswap
>   */
> +#ifdef __HAVE_BUILTIN_BSWAP64__
> +#define __swab64(x) __builtin_bswap64((__u64)(x))
> +#else
>  #define __swab64(x)  \
>   (__builtin_constant_p((__u64)(x)) ? \
>   ___constant_swab64(x) : \
>   __fswab64(x))
> +#endif
>  
>  /**
>   * __swahw32 - return a word-swapped 32-bit value
> 

-- 
Josh


[PATCH, RFT] byteswap: try to avoid __builtin_constant_p gcc bug

2016-04-27 Thread Arnd Bergmann
This is another attempt to avoid a regression in wwn_to_u64()
after it started using get_unaligned_be64(), which in turn
ran into a bug on gcc-4.9 through 6.1.

As part of the problem is how __builtin_constant_p gets evaluated
on an argument passed by reference into an inline function, this
avoids the use of __builtin_constant_p() for all architectures
that set CONFIG_ARCH_USE_BUILTIN_BSWAP. Most architectures do not
set ARCH_SUPPORTS_OPTIMIZED_INLINING, which means they probably
do not suffer from the problem in the qla2xxx driver, but they
might still run into it elsewhere.

I have not been able to reproduce the original problem, so I don't
know if this patch solves it, but it leads to simpler code doing
the same thing, so at least there should be no downsides.

Please test.

Signed-off-by: Arnd Bergmann 

diff --git a/include/uapi/linux/swab.h b/include/uapi/linux/swab.h
index 3f10e5317b46..de56fd54428d 100644
--- a/include/uapi/linux/swab.h
+++ b/include/uapi/linux/swab.h
@@ -45,9 +45,7 @@
 
 static inline __attribute_const__ __u16 __fswab16(__u16 val)
 {
-#ifdef __HAVE_BUILTIN_BSWAP16__
-   return __builtin_bswap16(val);
-#elif defined (__arch_swab16)
+#if defined (__arch_swab16)
return __arch_swab16(val);
 #else
return ___constant_swab16(val);
@@ -56,9 +54,7 @@ static inline __attribute_const__ __u16 __fswab16(__u16 val)
 
 static inline __attribute_const__ __u32 __fswab32(__u32 val)
 {
-#ifdef __HAVE_BUILTIN_BSWAP32__
-   return __builtin_bswap32(val);
-#elif defined(__arch_swab32)
+#if defined(__arch_swab32)
return __arch_swab32(val);
 #else
return ___constant_swab32(val);
@@ -67,9 +63,7 @@ static inline __attribute_const__ __u32 __fswab32(__u32 val)
 
 static inline __attribute_const__ __u64 __fswab64(__u64 val)
 {
-#ifdef __HAVE_BUILTIN_BSWAP64__
-   return __builtin_bswap64(val);
-#elif defined (__arch_swab64)
+#if defined (__arch_swab64)
return __arch_swab64(val);
 #elif defined(__SWAB_64_THRU_32__)
__u32 h = val >> 32;
@@ -102,28 +96,40 @@ static inline __attribute_const__ __u32 __fswahb32(__u32 
val)
  * __swab16 - return a byteswapped 16-bit value
  * @x: value to byteswap
  */
+#ifdef __HAVE_BUILTIN_BSWAP16__
+#define __swab16(x) __builtin_bswap16((__u16)(x))
+#else
 #define __swab16(x)\
(__builtin_constant_p((__u16)(x)) ? \
___constant_swab16(x) : \
__fswab16(x))
+#endif
 
 /**
  * __swab32 - return a byteswapped 32-bit value
  * @x: value to byteswap
  */
+#ifdef __HAVE_BUILTIN_BSWAP32__
+#define __swab32(x) __builtin_bswap32((__u32)(x))
+#else
 #define __swab32(x)\
(__builtin_constant_p((__u32)(x)) ? \
___constant_swab32(x) : \
__fswab32(x))
+#endif
 
 /**
  * __swab64 - return a byteswapped 64-bit value
  * @x: value to byteswap
  */
+#ifdef __HAVE_BUILTIN_BSWAP64__
+#define __swab64(x) __builtin_bswap64((__u64)(x))
+#else
 #define __swab64(x)\
(__builtin_constant_p((__u64)(x)) ? \
___constant_swab64(x) : \
__fswab64(x))
+#endif
 
 /**
  * __swahw32 - return a word-swapped 32-bit value



[PATCH, RFT] byteswap: try to avoid __builtin_constant_p gcc bug

2016-04-27 Thread Arnd Bergmann
This is another attempt to avoid a regression in wwn_to_u64()
after it started using get_unaligned_be64(), which in turn
ran into a bug on gcc-4.9 through 6.1.

As part of the problem is how __builtin_constant_p gets evaluated
on an argument passed by reference into an inline function, this
avoids the use of __builtin_constant_p() for all architectures
that set CONFIG_ARCH_USE_BUILTIN_BSWAP. Most architectures do not
set ARCH_SUPPORTS_OPTIMIZED_INLINING, which means they probably
do not suffer from the problem in the qla2xxx driver, but they
might still run into it elsewhere.

I have not been able to reproduce the original problem, so I don't
know if this patch solves it, but it leads to simpler code doing
the same thing, so at least there should be no downsides.

Please test.

Signed-off-by: Arnd Bergmann 

diff --git a/include/uapi/linux/swab.h b/include/uapi/linux/swab.h
index 3f10e5317b46..de56fd54428d 100644
--- a/include/uapi/linux/swab.h
+++ b/include/uapi/linux/swab.h
@@ -45,9 +45,7 @@
 
 static inline __attribute_const__ __u16 __fswab16(__u16 val)
 {
-#ifdef __HAVE_BUILTIN_BSWAP16__
-   return __builtin_bswap16(val);
-#elif defined (__arch_swab16)
+#if defined (__arch_swab16)
return __arch_swab16(val);
 #else
return ___constant_swab16(val);
@@ -56,9 +54,7 @@ static inline __attribute_const__ __u16 __fswab16(__u16 val)
 
 static inline __attribute_const__ __u32 __fswab32(__u32 val)
 {
-#ifdef __HAVE_BUILTIN_BSWAP32__
-   return __builtin_bswap32(val);
-#elif defined(__arch_swab32)
+#if defined(__arch_swab32)
return __arch_swab32(val);
 #else
return ___constant_swab32(val);
@@ -67,9 +63,7 @@ static inline __attribute_const__ __u32 __fswab32(__u32 val)
 
 static inline __attribute_const__ __u64 __fswab64(__u64 val)
 {
-#ifdef __HAVE_BUILTIN_BSWAP64__
-   return __builtin_bswap64(val);
-#elif defined (__arch_swab64)
+#if defined (__arch_swab64)
return __arch_swab64(val);
 #elif defined(__SWAB_64_THRU_32__)
__u32 h = val >> 32;
@@ -102,28 +96,40 @@ static inline __attribute_const__ __u32 __fswahb32(__u32 
val)
  * __swab16 - return a byteswapped 16-bit value
  * @x: value to byteswap
  */
+#ifdef __HAVE_BUILTIN_BSWAP16__
+#define __swab16(x) __builtin_bswap16((__u16)(x))
+#else
 #define __swab16(x)\
(__builtin_constant_p((__u16)(x)) ? \
___constant_swab16(x) : \
__fswab16(x))
+#endif
 
 /**
  * __swab32 - return a byteswapped 32-bit value
  * @x: value to byteswap
  */
+#ifdef __HAVE_BUILTIN_BSWAP32__
+#define __swab32(x) __builtin_bswap32((__u32)(x))
+#else
 #define __swab32(x)\
(__builtin_constant_p((__u32)(x)) ? \
___constant_swab32(x) : \
__fswab32(x))
+#endif
 
 /**
  * __swab64 - return a byteswapped 64-bit value
  * @x: value to byteswap
  */
+#ifdef __HAVE_BUILTIN_BSWAP64__
+#define __swab64(x) __builtin_bswap64((__u64)(x))
+#else
 #define __swab64(x)\
(__builtin_constant_p((__u64)(x)) ? \
___constant_swab64(x) : \
__fswab64(x))
+#endif
 
 /**
  * __swahw32 - return a word-swapped 32-bit value



Re: [PATCH] media: fix media_ioctl use-after-free when driver unbinds

2016-04-27 Thread Shuah Khan
On 04/27/2016 10:43 AM, Lars-Peter Clausen wrote:
> Looks mostly good, a few comments.
> 
> On 04/27/2016 05:08 AM, Shuah Khan wrote:
> [...]
>> @@ -428,7 +428,7 @@ static long media_device_ioctl(struct file *filp, 
>> unsigned int cmd,
>> unsigned long arg)
>>  {
>>  struct media_devnode *devnode = media_devnode_data(filp);
>> -struct media_device *dev = to_media_device(devnode);
> 
> Can we keep the helper macro, means we don't need to touch this code.

Yeah. I have been thinking about that as well. It avoids changes
and abstracts it.

> 
>> +struct media_device *dev = devnode->media_dev;
> 
> You need a lock to protect this from running concurrently with
> media_device_unregister() otherwise the struct might be freed while still in
> use.
> 

Right. This needs to be protected.

>>  long ret;
>>  
>>  switch (cmd) {
> [...]
>> @@ -725,21 +726,26 @@ int __must_check __media_device_register(struct 
>> media_device *mdev,
>>  {
>>  int ret;
>>  
>> +mdev->devnode = kzalloc(sizeof(struct media_devnode), GFP_KERNEL);
> 
> sizeof(*mdev->devnode) is preferred kernel style,

Yeah. Force of habit, I keep forgetting it.

> 
>> +if (!mdev->devnode)
>> +return -ENOMEM;
>> +
>>  /* Register the device node. */
>> -mdev->devnode.fops = _device_fops;
>> -mdev->devnode.parent = mdev->dev;
>> -mdev->devnode.release = media_device_release;
>> +mdev->devnode->fops = _device_fops;
>> +mdev->devnode->parent = mdev->dev;
>> +mdev->devnode->media_dev = mdev;
>> +mdev->devnode->release = media_device_release;
> 
> This should no longer be necessary. Just drop the release callback altogether.

It does nothing at the moment. I believe the intent is for this routine
to invoke driver hooks, if any, at the media_device level. It gets called
from media_devnode_release(), which is the media_devnode->dev.release.
I will look into whether it can be removed.

> 
>>  
>>  /* Set version 0 to indicate user-space that the graph is static */
>>  mdev->topology_version = 0;
>>  
> [...]
>> @@ -813,8 +819,10 @@ void media_device_unregister(struct media_device *mdev)
>>  
>>  spin_unlock(>lock);
>>  
>> -device_remove_file(>devnode.dev, _attr_model);
>> -media_devnode_unregister(>devnode);
>> +device_remove_file(>devnode->dev, _attr_model);
>> +media_devnode_unregister(mdev->devnode);
>> +/* kfree devnode is done via kobject_put() handler */
>> +mdev->devnode = NULL;
> 
> mdev->devnode->media_dev needs to be set to NULL.

Yes. Thanks for catching it.

> 
>>  
>>  dev_dbg(mdev->dev, "Media device unregistered\n");
>>  }
>> diff --git a/drivers/media/media-devnode.c b/drivers/media/media-devnode.c
>> index 29409f4..9af9ba1 100644
>> --- a/drivers/media/media-devnode.c
>> +++ b/drivers/media/media-devnode.c
>> @@ -171,6 +171,9 @@ static int media_open(struct inode *inode, struct file 
>> *filp)
>>  mutex_unlock(_devnode_lock);
>>  return -ENXIO;
>>  }
>> +
>> +kobject_get(>kobj);
> 
> This is not necessary, and if it was it would be prone to race condition as
> the last reference could be dropped before this line. But assigning the cdev
> parent makes sure that we always have a reference to the object while the
> open() callback is running.

I don't see cdev parent kobj get in cdev_get() which does kobject_get()
on cdev->kobj. Is that enough to get the reference?

cdev_add() gets the cdev parent kobj and cdev_del() puts it back. That is
the reason why I added a get here and put in media_release().

I can remove the get and put and test. Looks like I am not checking
kobject_get() return value which isn't good?

> 
>> +
>>  /* and increase the device refcount */
>>  get_device(>dev);
>>  mutex_unlock(_devnode_lock);
>>  /*
> [...]
>> diff --git a/include/media/media-devnode.h b/include/media/media-devnode.h
>> index fe42f08..ba4bdaa 100644
>> --- a/include/media/media-devnode.h
>> +++ b/include/media/media-devnode.h
>> @@ -70,7 +70,9 @@ struct media_file_operations {
>>   * @fops:   pointer to struct _file_operations with media device ops
>>   * @dev:struct device pointer for the media controller device
>>   * @cdev:   struct cdev pointer character device
>> + * @kobj:   struct kobject
>>   * @parent: parent device
>> + * @media_dev:  media device
>>   * @minor:  device node minor number
>>   * @flags:  flags, combination of the MEDIA_FLAG_* constants
>>   * @release:release callback called at the end of 
>> media_devnode_release()
>> @@ -87,7 +89,9 @@ struct media_devnode {
>>  /* sysfs */
>>  struct device dev;  /* media device */
>>  struct cdev cdev;   /* character device */
>> +struct kobject kobj;/* set as cdev parent kobj */
> 
> You don't need a extra kobj. Just use the struct dev kobj.

Yeah I can use that as long as I can override the default release
function with media_devnode_free(). 

Re: [PATCH] media: fix media_ioctl use-after-free when driver unbinds

2016-04-27 Thread Shuah Khan
On 04/27/2016 10:43 AM, Lars-Peter Clausen wrote:
> Looks mostly good, a few comments.
> 
> On 04/27/2016 05:08 AM, Shuah Khan wrote:
> [...]
>> @@ -428,7 +428,7 @@ static long media_device_ioctl(struct file *filp, 
>> unsigned int cmd,
>> unsigned long arg)
>>  {
>>  struct media_devnode *devnode = media_devnode_data(filp);
>> -struct media_device *dev = to_media_device(devnode);
> 
> Can we keep the helper macro, means we don't need to touch this code.

Yeah. I have been thinking about that as well. It avoids changes
and abstracts it.

> 
>> +struct media_device *dev = devnode->media_dev;
> 
> You need a lock to protect this from running concurrently with
> media_device_unregister() otherwise the struct might be freed while still in
> use.
> 

Right. This needs to be protected.

>>  long ret;
>>  
>>  switch (cmd) {
> [...]
>> @@ -725,21 +726,26 @@ int __must_check __media_device_register(struct 
>> media_device *mdev,
>>  {
>>  int ret;
>>  
>> +mdev->devnode = kzalloc(sizeof(struct media_devnode), GFP_KERNEL);
> 
> sizeof(*mdev->devnode) is preferred kernel style,

Yeah. Force of habit, I keep forgetting it.

> 
>> +if (!mdev->devnode)
>> +return -ENOMEM;
>> +
>>  /* Register the device node. */
>> -mdev->devnode.fops = _device_fops;
>> -mdev->devnode.parent = mdev->dev;
>> -mdev->devnode.release = media_device_release;
>> +mdev->devnode->fops = _device_fops;
>> +mdev->devnode->parent = mdev->dev;
>> +mdev->devnode->media_dev = mdev;
>> +mdev->devnode->release = media_device_release;
> 
> This should no longer be necessary. Just drop the release callback altogether.

It does nothing at the moment. I believe the intent is for this routine
to invoke driver hooks, if any, at the media_device level. It gets called
from media_devnode_release(), which is the media_devnode->dev.release.
I will look into whether it can be removed.

> 
>>  
>>  /* Set version 0 to indicate user-space that the graph is static */
>>  mdev->topology_version = 0;
>>  
> [...]
>> @@ -813,8 +819,10 @@ void media_device_unregister(struct media_device *mdev)
>>  
>>  spin_unlock(>lock);
>>  
>> -device_remove_file(>devnode.dev, _attr_model);
>> -media_devnode_unregister(>devnode);
>> +device_remove_file(>devnode->dev, _attr_model);
>> +media_devnode_unregister(mdev->devnode);
>> +/* kfree devnode is done via kobject_put() handler */
>> +mdev->devnode = NULL;
> 
> mdev->devnode->media_dev needs to be set to NULL.

Yes. Thanks for catching it.

> 
>>  
>>  dev_dbg(mdev->dev, "Media device unregistered\n");
>>  }
>> diff --git a/drivers/media/media-devnode.c b/drivers/media/media-devnode.c
>> index 29409f4..9af9ba1 100644
>> --- a/drivers/media/media-devnode.c
>> +++ b/drivers/media/media-devnode.c
>> @@ -171,6 +171,9 @@ static int media_open(struct inode *inode, struct file 
>> *filp)
>>  mutex_unlock(_devnode_lock);
>>  return -ENXIO;
>>  }
>> +
>> +kobject_get(>kobj);
> 
> This is not necessary, and if it was it would be prone to race condition as
> the last reference could be dropped before this line. But assigning the cdev
> parent makes sure that we always have a reference to the object while the
> open() callback is running.

I don't see cdev parent kobj get in cdev_get() which does kobject_get()
on cdev->kobj. Is that enough to get the reference?

cdev_add() gets the cdev parent kobj and cdev_del() puts it back. That is
the reason why I added a get here and put in media_release().

I can remove the get and put and test. Looks like I am not checking
kobject_get() return value which isn't good?

> 
>> +
>>  /* and increase the device refcount */
>>  get_device(>dev);
>>  mutex_unlock(_devnode_lock);
>>  /*
> [...]
>> diff --git a/include/media/media-devnode.h b/include/media/media-devnode.h
>> index fe42f08..ba4bdaa 100644
>> --- a/include/media/media-devnode.h
>> +++ b/include/media/media-devnode.h
>> @@ -70,7 +70,9 @@ struct media_file_operations {
>>   * @fops:   pointer to struct _file_operations with media device ops
>>   * @dev:struct device pointer for the media controller device
>>   * @cdev:   struct cdev pointer character device
>> + * @kobj:   struct kobject
>>   * @parent: parent device
>> + * @media_dev:  media device
>>   * @minor:  device node minor number
>>   * @flags:  flags, combination of the MEDIA_FLAG_* constants
>>   * @release:release callback called at the end of 
>> media_devnode_release()
>> @@ -87,7 +89,9 @@ struct media_devnode {
>>  /* sysfs */
>>  struct device dev;  /* media device */
>>  struct cdev cdev;   /* character device */
>> +struct kobject kobj;/* set as cdev parent kobj */
> 
> You don't need a extra kobj. Just use the struct dev kobj.

Yeah I can use that as long as I can override the default release
function with media_devnode_free(). 

Re: [PATCH v4 1/7] regulator: rk808: Add rk808_reg_ops_ranges for LDO3

2016-04-27 Thread Heiko Stübner
Am Dienstag, 26. April 2016, 16:54:04 schrieb Wadim Egorov:
> LDO_REG3 descriptor is using linear_ranges.
> Add and use proper ops for LDO_REG3.
> 
> Signed-off-by: Wadim Egorov 

I'm too late to the party judging by Mark's "Applied" message, but just to
confirm, this patch successfully fixes the issue on veyron-devices reported
yesterday.


Heiko


Re: [PATCH v4 1/7] regulator: rk808: Add rk808_reg_ops_ranges for LDO3

2016-04-27 Thread Heiko Stübner
Am Dienstag, 26. April 2016, 16:54:04 schrieb Wadim Egorov:
> LDO_REG3 descriptor is using linear_ranges.
> Add and use proper ops for LDO_REG3.
> 
> Signed-off-by: Wadim Egorov 

I'm too late to the party judging by Mark's "Applied" message, but just to
confirm, this patch successfully fixes the issue on veyron-devices reported
yesterday.


Heiko


Re: [PATCH 4/4] thermal: bang-bang governor: act on lower trip boundary

2016-04-27 Thread Eduardo Valentin
On Mon, Apr 25, 2016 at 11:02:47AM +0800, Caesar Wang wrote:
> From: Sascha Hauer 
> 
> With interrupt driven thermal zones we pass the lower and upper
> temperature on which shall be acted, so in the governor we have to act on
> the exact lower temperature to be consistent. Otherwise an interrupt maybe
> generated on the exact lower temperature, but the bang bang governor does
> not react.

What is the expected impact on polling driven zones that use bang bang
after this change?

> 
> Signed-off-by: Sascha Hauer 
> Signed-off-by: Caesar Wang 
> Cc: Zhang Rui 
> Cc: Eduardo Valentin 
> Cc: linux...@vger.kernel.org
> 
> ---
> 
>  drivers/thermal/gov_bang_bang.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/thermal/gov_bang_bang.c b/drivers/thermal/gov_bang_bang.c
> index 70836c5..9d1dfea 100644
> --- a/drivers/thermal/gov_bang_bang.c
> +++ b/drivers/thermal/gov_bang_bang.c
> @@ -59,7 +59,7 @@ static void thermal_zone_trip_update(struct 
> thermal_zone_device *tz, int trip)
>   if (instance->target == 0 && tz->temperature >= trip_temp)
>   instance->target = 1;
>   else if (instance->target == 1 &&
> - tz->temperature < trip_temp - trip_hyst)
> + tz->temperature <= trip_temp - trip_hyst)
>   instance->target = 0;
>  
>   dev_dbg(&instance->cdev->device, "target=%d\n",
> -- 
> 1.9.1
> 


Re: [PATCH 4/4] thermal: bang-bang governor: act on lower trip boundary

2016-04-27 Thread Eduardo Valentin
On Mon, Apr 25, 2016 at 11:02:47AM +0800, Caesar Wang wrote:
> From: Sascha Hauer 
> 
> With interrupt driven thermal zones we pass the lower and upper
> temperature on which shall be acted, so in the governor we have to act on
> the exact lower temperature to be consistent. Otherwise an interrupt maybe
> generated on the exact lower temperature, but the bang bang governor does
> not react.

What is the expected impact on polling driven zones that use bang bang
after this change?

> 
> Signed-off-by: Sascha Hauer 
> Signed-off-by: Caesar Wang 
> Cc: Zhang Rui 
> Cc: Eduardo Valentin 
> Cc: linux...@vger.kernel.org
> 
> ---
> 
>  drivers/thermal/gov_bang_bang.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/thermal/gov_bang_bang.c b/drivers/thermal/gov_bang_bang.c
> index 70836c5..9d1dfea 100644
> --- a/drivers/thermal/gov_bang_bang.c
> +++ b/drivers/thermal/gov_bang_bang.c
> @@ -59,7 +59,7 @@ static void thermal_zone_trip_update(struct 
> thermal_zone_device *tz, int trip)
>   if (instance->target == 0 && tz->temperature >= trip_temp)
>   instance->target = 1;
>   else if (instance->target == 1 &&
> - tz->temperature < trip_temp - trip_hyst)
> + tz->temperature <= trip_temp - trip_hyst)
>   instance->target = 0;
>  
>   dev_dbg(&instance->cdev->device, "target=%d\n",
> -- 
> 1.9.1
> 


Re: [PATCH 2/4] thermal: of: implement .set_trips for device tree thermal zones

2016-04-27 Thread Eduardo Valentin
On Mon, Apr 25, 2016 at 11:02:45AM +0800, Caesar Wang wrote:
> From: Sascha Hauer 
> 
> Signed-off-by: Sascha Hauer 
> Signed-off-by: Caesar Wang 
> Cc: Zhang Rui 
> Cc: Eduardo Valentin 
> Cc: linux...@vger.kernel.org
> ---
> 
>  drivers/thermal/of-thermal.c | 12 
>  include/linux/thermal.h  |  4 
>  2 files changed, 16 insertions(+)
> 
> diff --git a/drivers/thermal/of-thermal.c b/drivers/thermal/of-thermal.c
> index b8e509c..8722e63 100644
> --- a/drivers/thermal/of-thermal.c
> +++ b/drivers/thermal/of-thermal.c
> @@ -101,6 +101,17 @@ static int of_thermal_get_temp(struct 
> thermal_zone_device *tz,
>   return data->ops->get_temp(data->sensor_data, temp);
>  }
>  
> +static int of_thermal_set_trips(struct thermal_zone_device *tz,
> + int low, int high)
> +{
> + struct __thermal_zone *data = tz->devdata;
> +
> + if (!data->ops || !data->ops->set_trips)
> + return -EINVAL;
> +
> + return data->ops->set_trips(data->sensor_data, low, high);
> +}
> +
>  /**
>   * of_thermal_get_ntrips - function to export number of available trip
>   *  points.
> @@ -427,6 +438,7 @@ thermal_zone_of_add_sensor(struct device_node *zone,
>  
>   tzd->ops->get_temp = of_thermal_get_temp;
>   tzd->ops->get_trend = of_thermal_get_trend;
> + tzd->ops->set_trips = of_thermal_set_trips;
>   tzd->ops->set_emul_temp = of_thermal_set_emul_temp;
>   mutex_unlock(>lock);
>  
> diff --git a/include/linux/thermal.h b/include/linux/thermal.h
> index e258359..cb64866 100644
> --- a/include/linux/thermal.h
> +++ b/include/linux/thermal.h
> @@ -336,12 +336,16 @@ struct thermal_genl_event {
>   *
>   * Optional:
>   * @get_trend: a pointer to a function that reads the sensor temperature 
> trend.
> + * @@set_trips: a pointer to a function that sets a temperature window. When
> + *   this window is left the driver must inform the thermal core via
> + *  thermal_zone_device_update.

Ok. We start to see some documentation and expectation being stated
here. Nice. Please respin the comment on thermal core too, so drivers
that don't use OF will also be aware of this feature and how to use it.

>   * @set_emul_temp: a pointer to a function that sets sensor emulated
>   *  temperature.
>   */
>  struct thermal_zone_of_device_ops {
>   int (*get_temp)(void *, int *);
>   int (*get_trend)(void *, long *);
> + int (*set_trips)(void *, int, int);
>   int (*set_emul_temp)(void *, int);
>   int (*set_trip_temp)(void *, int, int);
>  };
> -- 
> 1.9.1
> 


Re: [PATCH 2/4] thermal: of: implement .set_trips for device tree thermal zones

2016-04-27 Thread Eduardo Valentin
On Mon, Apr 25, 2016 at 11:02:45AM +0800, Caesar Wang wrote:
> From: Sascha Hauer 
> 
> Signed-off-by: Sascha Hauer 
> Signed-off-by: Caesar Wang 
> Cc: Zhang Rui 
> Cc: Eduardo Valentin 
> Cc: linux...@vger.kernel.org
> ---
> 
>  drivers/thermal/of-thermal.c | 12 
>  include/linux/thermal.h  |  4 
>  2 files changed, 16 insertions(+)
> 
> diff --git a/drivers/thermal/of-thermal.c b/drivers/thermal/of-thermal.c
> index b8e509c..8722e63 100644
> --- a/drivers/thermal/of-thermal.c
> +++ b/drivers/thermal/of-thermal.c
> @@ -101,6 +101,17 @@ static int of_thermal_get_temp(struct 
> thermal_zone_device *tz,
>   return data->ops->get_temp(data->sensor_data, temp);
>  }
>  
> +static int of_thermal_set_trips(struct thermal_zone_device *tz,
> + int low, int high)
> +{
> + struct __thermal_zone *data = tz->devdata;
> +
> + if (!data->ops || !data->ops->set_trips)
> + return -EINVAL;
> +
> + return data->ops->set_trips(data->sensor_data, low, high);
> +}
> +
>  /**
>   * of_thermal_get_ntrips - function to export number of available trip
>   *  points.
> @@ -427,6 +438,7 @@ thermal_zone_of_add_sensor(struct device_node *zone,
>  
>   tzd->ops->get_temp = of_thermal_get_temp;
>   tzd->ops->get_trend = of_thermal_get_trend;
> + tzd->ops->set_trips = of_thermal_set_trips;
>   tzd->ops->set_emul_temp = of_thermal_set_emul_temp;
>   mutex_unlock(>lock);
>  
> diff --git a/include/linux/thermal.h b/include/linux/thermal.h
> index e258359..cb64866 100644
> --- a/include/linux/thermal.h
> +++ b/include/linux/thermal.h
> @@ -336,12 +336,16 @@ struct thermal_genl_event {
>   *
>   * Optional:
>   * @get_trend: a pointer to a function that reads the sensor temperature 
> trend.
> + * @@set_trips: a pointer to a function that sets a temperature window. When
> + *   this window is left the driver must inform the thermal core via
> + *  thermal_zone_device_update.

Ok. We start to see some documentation and expectation being stated
here. Nice. Please respin the comment on thermal core too, so drivers
that don't use OF will also be aware of this feature and how to use it.

>   * @set_emul_temp: a pointer to a function that sets sensor emulated
>   *  temperature.
>   */
>  struct thermal_zone_of_device_ops {
>   int (*get_temp)(void *, int *);
>   int (*get_trend)(void *, long *);
> + int (*set_trips)(void *, int, int);
>   int (*set_emul_temp)(void *, int);
>   int (*set_trip_temp)(void *, int, int);
>  };
> -- 
> 1.9.1
> 


Re: [PATCH 7/9] thermal: of: Add support for hardware-tracked trip points

2016-04-27 Thread Eduardo Valentin
On Fri, Apr 22, 2016 at 06:17:54PM +0800, Caesar Wang wrote:
> Hi Sascha,
> These are still the newest patches. I won't have any resources in the
> near future for continuing the work on them, so feel free to pick them
> up. There hasn't been much discussion around these patches which was the
> reason I abandoned them.

Yes, this is correct. Unfortunately, I let those fall through the
cracks. Overall, I liked the idea, but never got the time to give Sascha
the feedback on minor changes I wanted.


> 
> Okay.
> 
> I start to pick them up and do some tests in my github.
> https://github.com/Caesar-github/rockchip/commits/wip/support-thermal-hardware-trip-points
> 

Ok. I will follow up on this then.

> _
> Caesar
> 
> >
> >Sascha
> >
> >
> 
> -- 
> Thanks,
> Caesar
> 


Re: [PATCH 7/9] thermal: of: Add support for hardware-tracked trip points

2016-04-27 Thread Eduardo Valentin
On Fri, Apr 22, 2016 at 06:17:54PM +0800, Caesar Wang wrote:
> Hi Sascha,
> These are still the newest patches. I won't have any resources in the
> near future for continuing the work on them, so feel free to pick them
> up. There hasn't been much discussion around these patches which was the
> reason I abandoned them.

Yes, this is correct. Unfortunately, I let those fall through the
cracks. Overall, I liked the idea, but never got the time to give Sascha
the feedback on minor changes I wanted.


> 
> Okay.
> 
> I start to pick them up and do some tests in my github.
> https://github.com/Caesar-github/rockchip/commits/wip/support-thermal-hardware-trip-points
> 

Ok. I will follow up on this then.

> _
> Caesar
> 
> >
> >Sascha
> >
> >
> 
> -- 
> Thanks,
> Caesar
> 


Re: [PATCH 1/4] thermal: Add support for hardware-tracked trip points

2016-04-27 Thread Eduardo Valentin
A couple of comments as follows,

On Mon, Apr 25, 2016 at 11:02:44AM +0800, Caesar Wang wrote:
> From: Sascha Hauer 
> 
> This adds support for hardware-tracked trip points to the device tree
> thermal sensor framework.
> 
> The framework supports an arbitrary number of trip points. Whenever
> the current temperature is updated, the trip points immediately
> below and above the current temperature are found. A .set_trips
> callback is then called with the temperatures. If there is no trip
> point above or below the current temperature, the passed trip
> temperature will be -INT_MAX or INT_MAX respectively. In this callback,
> the driver should program the hardware such that it is notified
> when either of these trip points are triggered. When a trip point
> is triggered, the driver should call `thermal_zone_device_update'
> for the respective thermal zone. This will cause the trip points
> to be updated again.
> 
> If .set_trips is not implemented, the framework behaves as before.
> 
> This patch is based on an earlier version from Mikko Perttunen
> 
> 
> Signed-off-by: Sascha Hauer 
> Signed-off-by: Caesar Wang 
> Cc: Zhang Rui 
> Cc: Eduardo Valentin 
> Cc: linux...@vger.kernel.org
> ---
> 
>  drivers/thermal/thermal_core.c | 48 
> ++
>  include/linux/thermal.h|  3 +++

Given that this is adding a new feature in the framework, I would prefer
if you could also describe the .set_trips() in the sysfs-api.txt
documentation file.

A short description of the expectation of what the framework is going to
do is also welcome. For example, are drivers supposed to setup the
polling together with the threshold based approach?

>  2 files changed, 51 insertions(+)
> 
> diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
> index f1db496..cfef8cc 100644
> --- a/drivers/thermal/thermal_core.c
> +++ b/drivers/thermal/thermal_core.c
> @@ -520,6 +520,47 @@ exit:
>  }
>  EXPORT_SYMBOL_GPL(thermal_zone_get_temp);
>  
> +static void thermal_zone_set_trips(struct thermal_zone_device *tz)
> +{
> + int low = -INT_MAX;
> + int high = INT_MAX;
> + int trip_temp, hysteresis;
> + int temp = tz->temperature;
> + int i, ret;
> +
> + if (!tz->ops->set_trips)
> + return;
> +
> + for (i = 0; i < tz->trips; i++) {
> + int trip_low;
> +
> + tz->ops->get_trip_temp(tz, i, &trip_temp);
> + tz->ops->get_trip_hyst(tz, i, &hysteresis);
> +
> + trip_low = trip_temp - hysteresis;
> +
> + if (trip_low < temp && trip_low > low)
> + low = trip_low;
> +
> + if (trip_temp > temp && trip_temp < high)
> + high = trip_temp;
> + }

Did I understand correctly that you will be flooded by IRQs when you
have:
1. One single trip point.
2. Your temp is above trip_temp

With the above, you would program as threshold:
high == trip_temp
low == trip_temp - hyst

And the IRQ would fire immediately, causing a device update, causing a
reprogramming, causing another IRQ, and this would continue until the
temperature goes below trip_temp, right?

> +
> + /* No need to change trip points */
> + if (tz->prev_low_trip == low && tz->prev_high_trip == high)
> + return;
> +
> + tz->prev_low_trip = low;
> + tz->prev_high_trip = high;
> +
> + dev_dbg(&tz->device, "new temperature boundaries: %d < x < %d\n",
> + low, high);
> +
> + ret = tz->ops->set_trips(tz, low, high);
> + if (ret)
> + dev_err(&tz->device, "Failed to set trips: %d\n", ret);
> +}
> +
>  static void update_temperature(struct thermal_zone_device *tz)
>  {
>   int temp, ret;
> @@ -569,6 +610,8 @@ void thermal_zone_device_update(struct 
> thermal_zone_device *tz)
>  
>   update_temperature(tz);
>  
> + thermal_zone_set_trips(tz);
> +
>   for (count = 0; count < tz->trips; count++)
>   handle_thermal_trip(tz, count);
>  }
> @@ -754,6 +797,9 @@ trip_point_hyst_store(struct device *dev, struct 
> device_attribute *attr,
>*/
>   ret = tz->ops->set_trip_hyst(tz, trip, temperature);
>  
> + if (!ret)
> + thermal_zone_set_trips(tz);
> +

You would probably want to do the same on trip_point_temp_store().

>   return ret ? ret : count;
>  }
>  
> @@ -1843,6 +1889,8 @@ struct thermal_zone_device 
> *thermal_zone_device_register(const char *type,
>   tz->trips = trips;
>   tz->passive_delay = passive_delay;
>   tz->polling_delay = polling_delay;
> + tz->prev_low_trip = INT_MAX;
> + tz->prev_high_trip = -INT_MAX;
>   /* A new thermal zone needs to be updated anyway. */
>   atomic_set(>need_update, 1);
>  
> diff --git a/include/linux/thermal.h b/include/linux/thermal.h
> index e45abe7..e258359 100644
> --- a/include/linux/thermal.h

Re: [PATCH 1/4] thermal: Add support for hardware-tracked trip points

2016-04-27 Thread Eduardo Valentin
A couple of comments as follows,

On Mon, Apr 25, 2016 at 11:02:44AM +0800, Caesar Wang wrote:
> From: Sascha Hauer 
> 
> This adds support for hardware-tracked trip points to the device tree
> thermal sensor framework.
> 
> The framework supports an arbitrary number of trip points. Whenever
> the current temperature is updated, the trip points immediately
> below and above the current temperature are found. A .set_trips
> callback is then called with the temperatures. If there is no trip
> point above or below the current temperature, the passed trip
> temperature will be -INT_MAX or INT_MAX respectively. In this callback,
> the driver should program the hardware such that it is notified
> when either of these trip points are triggered. When a trip point
> is triggered, the driver should call `thermal_zone_device_update'
> for the respective thermal zone. This will cause the trip points
> to be updated again.
> 
> If .set_trips is not implemented, the framework behaves as before.
> 
> This patch is based on an earlier version from Mikko Perttunen
> 
> 
> Signed-off-by: Sascha Hauer 
> Signed-off-by: Caesar Wang 
> Cc: Zhang Rui 
> Cc: Eduardo Valentin 
> Cc: linux...@vger.kernel.org
> ---
> 
>  drivers/thermal/thermal_core.c | 48 
> ++
>  include/linux/thermal.h|  3 +++

Given that this is adding a new feature in the framework, I would prefer
if you could also describe the .set_trips() in the sysfs-api.txt
documentation file.

A short description of the expectation of what the framework is going to
do is also welcome. For example, are drivers supposed to setup the
polling together with the threshold based approach?

>  2 files changed, 51 insertions(+)
> 
> diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
> index f1db496..cfef8cc 100644
> --- a/drivers/thermal/thermal_core.c
> +++ b/drivers/thermal/thermal_core.c
> @@ -520,6 +520,47 @@ exit:
>  }
>  EXPORT_SYMBOL_GPL(thermal_zone_get_temp);
>  
> +static void thermal_zone_set_trips(struct thermal_zone_device *tz)
> +{
> + int low = -INT_MAX;
> + int high = INT_MAX;
> + int trip_temp, hysteresis;
> + int temp = tz->temperature;
> + int i, ret;
> +
> + if (!tz->ops->set_trips)
> + return;
> +
> + for (i = 0; i < tz->trips; i++) {
> + int trip_low;
> +
> + tz->ops->get_trip_temp(tz, i, &trip_temp);
> + tz->ops->get_trip_hyst(tz, i, &hysteresis);
> +
> + trip_low = trip_temp - hysteresis;
> +
> + if (trip_low < temp && trip_low > low)
> + low = trip_low;
> +
> + if (trip_temp > temp && trip_temp < high)
> + high = trip_temp;
> + }

Did I understand correctly that you will be flooded by IRQs when you
have:
1. One single trip point.
2. Your temp is above trip_temp

With the above, you would program as threshold:
high == trip_temp
low == trip_temp - hyst

And the IRQ would fire immediately, causing a device update, causing a
reprogramming, causing another irq, and this would continue, until the
temperature goes below trip_temp, right?

> +
> + /* No need to change trip points */
> + if (tz->prev_low_trip == low && tz->prev_high_trip == high)
> + return;
> +
> + tz->prev_low_trip = low;
> + tz->prev_high_trip = high;
> +
> + dev_dbg(&tz->device, "new temperature boundaries: %d < x < %d\n",
> + low, high);
> +
> + ret = tz->ops->set_trips(tz, low, high);
> + if (ret)
> + dev_err(&tz->device, "Failed to set trips: %d\n", ret);
> +}
> +
>  static void update_temperature(struct thermal_zone_device *tz)
>  {
>   int temp, ret;
> @@ -569,6 +610,8 @@ void thermal_zone_device_update(struct 
> thermal_zone_device *tz)
>  
>   update_temperature(tz);
>  
> + thermal_zone_set_trips(tz);
> +
>   for (count = 0; count < tz->trips; count++)
>   handle_thermal_trip(tz, count);
>  }
> @@ -754,6 +797,9 @@ trip_point_hyst_store(struct device *dev, struct 
> device_attribute *attr,
>*/
>   ret = tz->ops->set_trip_hyst(tz, trip, temperature);
>  
> + if (!ret)
> + thermal_zone_set_trips(tz);
> +

You would probably want to do the same on trip_point_temp_store().

>   return ret ? ret : count;
>  }
>  
> @@ -1843,6 +1889,8 @@ struct thermal_zone_device 
> *thermal_zone_device_register(const char *type,
>   tz->trips = trips;
>   tz->passive_delay = passive_delay;
>   tz->polling_delay = polling_delay;
> + tz->prev_low_trip = INT_MAX;
> + tz->prev_high_trip = -INT_MAX;
>   /* A new thermal zone needs to be updated anyway. */
>   atomic_set(&tz->need_update, 1);
>  
> diff --git a/include/linux/thermal.h b/include/linux/thermal.h
> index e45abe7..e258359 100644
> --- a/include/linux/thermal.h
> +++ b/include/linux/thermal.h
> @@ -98,6 +98,7 @@ struct thermal_zone_device_ops {
>   int (*unbind) (struct thermal_zone_device 

Re: [RFC 1/4] perf kvm: Enable 'record' on powerpc

2016-04-27 Thread Arnaldo Carvalho de Melo
Em Wed, Apr 27, 2016 at 06:02:21PM +0530, Ravi Bangoria escreveu:
> Hi Arnaldo,
> 
> I've worked on your patch. I'm sending this patch(diff) to check if this
> is the same idea you want to progress with. I cleanup your patch,
> removed arch specific compile time directives and changed code to
> enable cross arch reporting. I tested record on powerpc and report
> on x86 and it's working.
> 
> Please give suggestion about your approach. Let me know if you have
> some other idea to progress with.
> 
> Here is the diff w.r.t perf/cpumode branch:
> 
> diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
> index bff6664..83ef6c6 100644
> --- a/tools/perf/builtin-kvm.c
> +++ b/tools/perf/builtin-kvm.c
> @@ -1480,6 +1480,60 @@ perf_stat:
>  }
>  #endif /* HAVE_KVM_STAT_SUPPORT */
> 
> +#define PPC_HV_DECREMENTER 2432
> +#define PPC_HV_BIT 3
> +#define PPC_PR_BIT 49
> +#define PPC_MAX 63
> +
> +static bool perf_sample__is_hv_dec_trap(struct perf_sample *sample, struct
> perf_evsel *evsel)
> +{
> +int trap = perf_evsel__intval(evsel, sample, "trap");
> +return trap == PPC_HV_DECREMENTER;
> +}
> +
> +static void perf_kvm__munge_ppc_guest_sample(struct perf_evsel *evsel,
> struct perf_sample *sample)
> +{
> +unsigned long msr, hv, pr;
> +
> +if (!perf_sample__is_hv_dec_trap(sample, evsel))
> +return;
> +
> +sample->ip = perf_evsel__intval(evsel, sample, "pc");
> +sample->cpumode = PERF_RECORD_MISC_GUEST_KERNEL;
> +
> +msr = perf_evsel__intval(evsel, sample, "msr");
> +hv = msr & ((unsigned long)1 << (PPC_MAX - PPC_HV_BIT));
> +pr = msr & ((unsigned long)1 << (PPC_MAX - PPC_PR_BIT));
> +if (!hv && pr)
> +sample->cpumode = PERF_RECORD_MISC_GUEST_USER;
> +}
> +
> +static bool perf_evlist__recorded_on_ppc(const struct perf_evlist *evlist)
> +{
> +if (evlist->env && evlist->env->arch) {
> +return !strcmp(evlist->env->arch, "ppc64") ||
> +   !strcmp(evlist->env->arch, "ppc64le");
> +}
> +return false;
> +}
> +
> +int perf_kvm__setup_munge_ppc_guest_event(struct perf_evlist *evlist)
> +{
> +struct perf_evsel *evsel;
> +const char name[] = "kvm_hv:kvm_guest_exit";
> +
> +if (!perf_evlist__recorded_on_ppc(evlist))
> +return 0;
> +
> +evsel = perf_evlist__find_tracepoint_by_name(evlist, name);
> +if (evsel == NULL)
> +return -1;
> +
> +evsel->munge_sample = perf_kvm__munge_ppc_guest_sample;
> +
> +return 0;
> +}
> +
>  static int __cmd_record(const char *file_name, int argc, const char **argv)
>  {
>  int rec_argc, i = 0, j;
> diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
> index ab47273..7cb41f7 100644
> --- a/tools/perf/builtin-report.c
> +++ b/tools/perf/builtin-report.c
> @@ -879,6 +879,12 @@ repeat:
>  if (session == NULL)
>  return -1;
> 
> +if (perf_guest &&
> +perf_kvm__setup_munge_ppc_guest_event(session->evlist)) {
> +pr_err("PPC event not present in %s file\n", input_name);
> +goto error;
> +}

This looks out of place, i.e. this reads: "For all cases where there is
a guest and we can't setup the ppc KVM guest related stuff, its an
error"

I think this gets clearer as:

if (perf_guest && perf_evlist__recorded_on_ppc(evlist) &&
perf_kvm__setup_munge_ppc_guest_event(session->evlist)) {
pr_err("PPC event not present in %s file\n", input_name);
goto error;
}

Then we read this as "Hey, if this was recorded on ppc, try to set
things up for ppc", but then again, what is this KVM stuff doing in the
generic 'perf report' code? 

What if this is a perf.data file generated on PPC but being read on PPC?
This will not make sense to munge it, right?

This is with what I remember from this case, please bear with me.

- Arnaldo

> +
>  if (report.queue_size) {
> ordered_events__set_alloc_size(&session->ordered_events,
> report.queue_size);
> diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> index 738ce22..1665171 100644
> --- a/tools/perf/util/evsel.c
> +++ b/tools/perf/util/evsel.c
> @@ -216,6 +216,7 @@ void perf_evsel__init(struct perf_evsel *evsel,
>  evsel->sample_size = __perf_evsel__sample_size(attr->sample_type);
>  perf_evsel__calc_id_pos(evsel);
>  evsel->cmdline_group_boundary = false;
> +evsel->munge_sample = NULL;
>  }
> 
>  struct perf_evsel *perf_evsel__new_idx(struct perf_event_attr *attr, int
> idx)
> @@ -1887,6 +1888,9 @@ int perf_evsel__parse_sample(struct perf_evsel *evsel,
> union perf_event *event,
>  }
>  }
> 
> +if (evsel->munge_sample != NULL)
> +evsel->munge_sample(evsel, data);
> +
>  return 0;
>  }
> 
> diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
> index 501ea6e..4637945 100644
> --- a/tools/perf/util/evsel.h
> +++ b/tools/perf/util/evsel.h
> @@ -12,6 +12,7 @@
>  #include "counts.h"
> 
>  struct perf_evsel;
> +struct 

Re: [RFC 1/4] perf kvm: Enable 'record' on powerpc

2016-04-27 Thread Arnaldo Carvalho de Melo
Em Wed, Apr 27, 2016 at 06:02:21PM +0530, Ravi Bangoria escreveu:
> Hi Arnaldo,
> 
> I've worked on your patch. I'm sending this patch(diff) to check if this
> is the same idea you want to progress with. I cleanup your patch,
> removed arch specific compile time directives and changed code to
> enable cross arch reporting. I tested record on powerpc and report
> on x86 and it's working.
> 
> Please give suggestion about your approach. Let me know if you have
> some other idea to progress with.
> 
> Here is the diff w.r.t perf/cpumode branch:
> 
> diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
> index bff6664..83ef6c6 100644
> --- a/tools/perf/builtin-kvm.c
> +++ b/tools/perf/builtin-kvm.c
> @@ -1480,6 +1480,60 @@ perf_stat:
>  }
>  #endif /* HAVE_KVM_STAT_SUPPORT */
> 
> +#define PPC_HV_DECREMENTER 2432
> +#define PPC_HV_BIT 3
> +#define PPC_PR_BIT 49
> +#define PPC_MAX 63
> +
> +static bool perf_sample__is_hv_dec_trap(struct perf_sample *sample, struct
> perf_evsel *evsel)
> +{
> +int trap = perf_evsel__intval(evsel, sample, "trap");
> +return trap == PPC_HV_DECREMENTER;
> +}
> +
> +static void perf_kvm__munge_ppc_guest_sample(struct perf_evsel *evsel,
> struct perf_sample *sample)
> +{
> +unsigned long msr, hv, pr;
> +
> +if (!perf_sample__is_hv_dec_trap(sample, evsel))
> +return;
> +
> +sample->ip = perf_evsel__intval(evsel, sample, "pc");
> +sample->cpumode = PERF_RECORD_MISC_GUEST_KERNEL;
> +
> +msr = perf_evsel__intval(evsel, sample, "msr");
> +hv = msr & ((unsigned long)1 << (PPC_MAX - PPC_HV_BIT));
> +pr = msr & ((unsigned long)1 << (PPC_MAX - PPC_PR_BIT));
> +if (!hv && pr)
> +sample->cpumode = PERF_RECORD_MISC_GUEST_USER;
> +}
> +
> +static bool perf_evlist__recorded_on_ppc(const struct perf_evlist *evlist)
> +{
> +if (evlist->env && evlist->env->arch) {
> +return !strcmp(evlist->env->arch, "ppc64") ||
> +   !strcmp(evlist->env->arch, "ppc64le");
> +}
> +return false;
> +}
> +
> +int perf_kvm__setup_munge_ppc_guest_event(struct perf_evlist *evlist)
> +{
> +struct perf_evsel *evsel;
> +const char name[] = "kvm_hv:kvm_guest_exit";
> +
> +if (!perf_evlist__recorded_on_ppc(evlist))
> +return 0;
> +
> +evsel = perf_evlist__find_tracepoint_by_name(evlist, name);
> +if (evsel == NULL)
> +return -1;
> +
> +evsel->munge_sample = perf_kvm__munge_ppc_guest_sample;
> +
> +return 0;
> +}
> +
>  static int __cmd_record(const char *file_name, int argc, const char **argv)
>  {
>  int rec_argc, i = 0, j;
> diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
> index ab47273..7cb41f7 100644
> --- a/tools/perf/builtin-report.c
> +++ b/tools/perf/builtin-report.c
> @@ -879,6 +879,12 @@ repeat:
>  if (session == NULL)
>  return -1;
> 
> +if (perf_guest &&
> +perf_kvm__setup_munge_ppc_guest_event(session->evlist)) {
> +pr_err("PPC event not present in %s file\n", input_name);
> +goto error;
> +}

This looks out of place, i.e. this reads: "For all cases where there is
a guest and we can't setup the ppc KVM guest related stuff, its an
error"

I think this gets clearer as:

if (perf_guest && perf_evlist__recorded_on_ppc(evlist) &&
perf_kvm__setup_munge_ppc_guest_event(session->evlist)) {
pr_err("PPC event not present in %s file\n", input_name);
goto error;
}

Then we read this as "Hey, if this was recorded on ppc, try to set
things up for ppc", but then again, what is this KVM stuff doing in the
generic 'perf report' code? 

What if this is a perf.data file generated on PPC but being read on PPC?
This will not make sense to munge it, right?

This is with what I remember from this case, please bear with me.

- Arnaldo

> +
>  if (report.queue_size) {
> ordered_events__set_alloc_size(&session->ordered_events,
> report.queue_size);
> diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> index 738ce22..1665171 100644
> --- a/tools/perf/util/evsel.c
> +++ b/tools/perf/util/evsel.c
> @@ -216,6 +216,7 @@ void perf_evsel__init(struct perf_evsel *evsel,
>  evsel->sample_size = __perf_evsel__sample_size(attr->sample_type);
>  perf_evsel__calc_id_pos(evsel);
>  evsel->cmdline_group_boundary = false;
> +evsel->munge_sample = NULL;
>  }
> 
>  struct perf_evsel *perf_evsel__new_idx(struct perf_event_attr *attr, int
> idx)
> @@ -1887,6 +1888,9 @@ int perf_evsel__parse_sample(struct perf_evsel *evsel,
> union perf_event *event,
>  }
>  }
> 
> +if (evsel->munge_sample != NULL)
> +evsel->munge_sample(evsel, data);
> +
>  return 0;
>  }
> 
> diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
> index 501ea6e..4637945 100644
> --- a/tools/perf/util/evsel.h
> +++ b/tools/perf/util/evsel.h
> @@ -12,6 +12,7 @@
>  #include "counts.h"
> 
>  struct perf_evsel;
> +struct 

Re: [PATCH] block: partitions: efi: Always check for alternative GPT at end of drive

2016-04-27 Thread Julius Werner
On Wed, Apr 27, 2016 at 8:09 AM, Karel Zak  wrote:
> On Tue, Apr 26, 2016 at 02:51:01PM -0700, Gwendal Grignou wrote:
>> Julius and I were looking at the code when we spotted the issue.
>>
>> As Julius said, "just pass a boot param", is not easy on certain
>> machines, like phone. It is not user friendly either.
>> The system won't boot at all, a typical user will have to do a full
>> reinstall to fix the error.
>
> And how typical user will fix the problem with corrupted primary
> header after successful boot? I mean, use alternative header (without
> force_gpt) is a good idea if we know that user will not ignore the
> problem. The current solution is to force user to do anything.
>
> It would be nice to have support for this issue in userspace
> and switch for example to single user mode, or so...
>
> I'm also have doubts that printk() is the best way how to report
> this issue to userspace if we want to trigger any action :-)

Holding the whole system hostage and forcing manual action is *not*
user-friendly behavior. Linux is no longer just something for hobbyist
hackers to install on their converted Windows machines at home... it
is a mature, modern operating system kernel used on a wide range of
devices (server farms, phones, embedded systems, etc.) and it should
behave like one. Not all of these platforms necessarily make it easy
for the user to drop into grub and add some command line parameters,
and it's the kernel's job to provide a suitable environment for all of
them so that policy decisions can be left to userspace.

So yes, userspace should resolve this problem, but in order to do that
you need to allow userspace to boot first! dmesg is one suitable way
to communicate the problem, and there are others which I wouldn't be
opposed to either, but no matter which channel we choose the kernel
still has to continue booting to allow the rest of the OS to deal with
it. Whether to ignore, silently repair or fail to boot from a
corrupted primary GPT is a policy decision and it should be made in
user space... if you need to retain the current behavior, it's easy to
add an init script that greps for GPT warnings and hangs to your
distro.


Re: [PATCH] block: partitions: efi: Always check for alternative GPT at end of drive

2016-04-27 Thread Julius Werner
On Wed, Apr 27, 2016 at 8:09 AM, Karel Zak  wrote:
> On Tue, Apr 26, 2016 at 02:51:01PM -0700, Gwendal Grignou wrote:
>> Julius and I were looking at the code when we spotted the issue.
>>
>> As Julius said, "just pass a boot param", is not easy on certain
>> machines, like phone. It is not user friendly either.
>> The system won't boot at all, a typical user will have to do a full
>> reinstall to fix the error.
>
> And how typical user will fix the problem with corrupted primary
> header after successful boot? I mean, use alternative header (without
> force_gpt) is a good idea if we know that user will not ignore the
> problem. The current solution is to force user to do anything.
>
> It would be nice to have support for this issue in userspace
> and switch for example to single user mode, or so...
>
> I'm also have doubts that printk() is the best way how to report
> this issue to userspace if we want to trigger any action :-)

Holding the whole system hostage and forcing manual action is *not*
user-friendly behavior. Linux is no longer just something for hobbyist
hackers to install on their converted Windows machines at home... it
is a mature, modern operating system kernel used on a wide range of
devices (server farms, phones, embedded systems, etc.) and it should
behave like one. Not all of these platforms necessarily make it easy
for the user to drop into grub and add some command line parameters,
and it's the kernel's job to provide a suitable environment for all of
them so that policy decisions can be left to userspace.

So yes, userspace should resolve this problem, but in order to do that
you need to allow userspace to boot first! dmesg is one suitable way
to communicate the problem, and there are others which I wouldn't be
opposed to either, but no matter which channel we choose the kernel
still has to continue booting to allow the rest of the OS to deal with
it. Whether to ignore, silently repair or fail to boot from a
corrupted primary GPT is a policy decision and it should be made in
user space... if you need to retain the current behavior, it's easy to
add an init script that greps for GPT warnings and hangs to your
distro.


Re: [PATCH] i2c-mux-pca9541: fix setup_timer.cocci warnings

2016-04-27 Thread Peter Rosin
On 2016-04-27 21:06, Julia Lawall wrote:
> Use setup_timer function instead of initializing timer with the function
> and data fields
>
> Generated by: scripts/coccinelle/api/setup_timer.cocci
>
> Signed-off-by: Fengguang Wu 
> Signed-off-by: Julia Lawall 
>
Acked-by: Peter Rosin 

Cheers,
Peter



Re: [PATCH] i2c-mux-pca9541: fix setup_timer.cocci warnings

2016-04-27 Thread Peter Rosin
On 2016-04-27 21:06, Julia Lawall wrote:
> Use setup_timer function instead of initializing timer with the function
> and data fields
>
> Generated by: scripts/coccinelle/api/setup_timer.cocci
>
> Signed-off-by: Fengguang Wu 
> Signed-off-by: Julia Lawall 
>
Acked-by: Peter Rosin 

Cheers,
Peter



Re: [PATCH v6 0/7] perf tools: Use SIGUSR2 control data dumpping

2016-04-27 Thread Arnaldo Carvalho de Melo
Em Wed, Apr 20, 2016 at 06:59:47PM +, Wang Nan escreveu:
> v5 -> v6: Improve trigger class: rename (Suggested by Namhyung Kim)
>   toggle -> hit; don't generate functions for each trigger,
> use generic functions instead.
> 
> Patch cleanup: switch auxtrace_snapshot to trigger in a
> isolated patch (2/7).

Applied to my perf/core branch, please check if all is well as I merged
two patches into one to avoid missing synthesized entries in the
bisection history.

- Arnaldo


Re: [PATCH v6 0/7] perf tools: Use SIGUSR2 control data dumpping

2016-04-27 Thread Arnaldo Carvalho de Melo
Em Wed, Apr 20, 2016 at 06:59:47PM +, Wang Nan escreveu:
> v5 -> v6: Improve trigger class: rename (Suggested by Namhyung Kim)
>   toggle -> hit; don't generate functions for each trigger,
> use generic functions instead.
> 
> Patch cleanup: switch auxtrace_snapshot to trigger in a
> isolated patch (2/7).

Applied to my perf/core branch, please check if all is well as I merged
two patches into one to avoid missing synthesized entries in the
bisection history.

- Arnaldo


Re: [PATCH] scsi: fc: force inlining of wwn conversion functions

2016-04-27 Thread Arnd Bergmann
On Wednesday 27 April 2016 13:05:03 Martin Jambor wrote:
> On Tue, Apr 26, 2016 at 05:58:20PM +0200, Arnd Bergmann wrote:
> > On Tuesday 26 April 2016 09:06:54 Martin K. Petersen wrote:
> > > > "Arnd" == Arnd Bergmann  writes:
> > > 
> > > Arnd> I don't think we can realistically blacklist gcc-4.9.{0,1,2,3},
> > > Arnd> gcc-5.{0,1,2,3}.* and gcc-6.0 and require everyone to upgrade to
> > > Arnd> compilers that have not been released yet in order to build a
> > > Arnd> linux-4.6 kernel.
> > > 
> > > I agree that compiler blacklisting is problematic and I'd like to avoid
> > > it. The question is how far we go in the kernel to accommodate various
> > > levels of brokenness.
> > > 
> > > In any case. Sticking compiler workarounds in device driver code is akin
> > > to putting demolition orders on display on Alpha Centauri. At the very
> > > minimum the patch should put a fat comment in the code stating that
> > > these wrapper functions or #defines should not be changed in the future
> > > because that'll break builds using gcc XYZ. But that does not solve the
> > > problem for anybody else that might be doing something similar.
> > > Converting between u64 and $RANDOM_TYPE in an inline wrapper does not
> > > seem like a rare and unusual programming pattern.
> > 
> > It's not the driver really, it's the core scsi/fc layer, which makes
> > it a little dangerous that a random driver.
> > 
> > I agree that putting a comment in would also help. What I understand
> > from the bug report is that to trigger this bug you need these elements:
> > 
> > 1. an inline function marked __always_inline
> > 2. another inline function that is automatically inlined (not 
> > __always_inline)
> > 3. CONFIG_OPTIMIZE_INLINING=y to guarantee 2
> > 4. __builtin_compatible_p inside that inline function
> 
> The __always_inline requirement is not true.  In fact, if you look at
> the example testcase filed in
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70646#c7 you'll see it
> uses __builtin_compatible_p in an __always inline function that is
> called from one that is not tagged with that attribute.
>
> And generally speaking, always inline is never a requirement, any call
> or chain of calls that the inliner can decide to inline can lead to
> the bug (if it complies with the condition below).

Ok, thanks for the clarification, I thought you always had to have both
kinds of inline functions.
 
> What is a requirement, though, is that __builtin_compatible_p is
> called on something passed in an argument by reference or in an
> aggregate (i.e. struct or array) argument.
> 
> So,
> 
>   int foo1 (unsigned long *ref)
>   {
> if (__builtin_constant (*ref))
>   ...
> else
>   /* wrongly unreachable code */
>   }
> 
>   }
> 
> cannot, and is fine.  But please note that wrapping a foo[12]-like
> function into a dereferencing wrapper might not help if foo[12] would
> be early-inlined into such wrapper (GCC has two inliners, a very
> simple early-inliner that only handles simple cases and a full-blown
> IPA inliner that contains the bug).  I believe this can be ensured by
> making the wrapper always_inline and never calling it indirectly (via
> a pointer).  Honza (CCed), you know inlining heuristics better, please
> correct me if my last statement is somehow inaccurate (or indeed if
> you have a better idea how kernel developers can make sure they do not
> hit the bug).

I guess that means that any user of this code in the kernel:

static inline __attribute_const__ __u64 __fswab64(__u64 val)
{
#ifdef __HAVE_BUILTIN_BSWAP64__
return __builtin_bswap64(val);
#elif defined (__arch_swab64)
return __arch_swab64(val);
#elif defined(__SWAB_64_THRU_32__)
__u32 h = val >> 32;
__u32 l = val & ((1ULL << 32) - 1);
return (((__u64)__fswab32(l)) << 32) | ((__u64)(__fswab32(h)));
#else
return ___constant_swab64(val);
#endif
}

#define __swab64(x) \
(__builtin_constant_p((__u64)(x)) ? \
___constant_swab64(x) : \
__fswab64(x))

static __always_inline __u64 __swab64p(const __u64 *p)
{   
#ifdef __arch_swab64p
return __arch_swab64p(p);
#else
return __swab64(*p);
#endif
}

has a chance of running into the same problem, and we may want to solve
it at the root. For architectures that define __HAVE_BUILTIN_BSWAP64__
(i.e. ARM, MIPS, POWERPC, S390, and x86 with gcc-4.4 or higher, 4.8
for __HAVE_BUILTIN_BSWAP16__), we can probably just change the logic
to avoid __builtin_constant_p() and always use __builtin_bswap64().

This won't help on TILE, which is the one architecture that sets
ARCH_SUPPORTS_OPTIMIZED_INLINING but does not set ARCH_USE_BUILTIN_BSWAP.
Chris Metcalf should be able to figure out whether we can just
set ARCH_USE_BUILTIN_BSWAP for tile as well.

Arnd


Re: [PATCH] scsi: fc: force inlining of wwn conversion functions

2016-04-27 Thread Arnd Bergmann
On Wednesday 27 April 2016 13:05:03 Martin Jambor wrote:
> On Tue, Apr 26, 2016 at 05:58:20PM +0200, Arnd Bergmann wrote:
> > On Tuesday 26 April 2016 09:06:54 Martin K. Petersen wrote:
> > > > "Arnd" == Arnd Bergmann  writes:
> > > 
> > > Arnd> I don't think we can realistically blacklist gcc-4.9.{0,1,2,3},
> > > Arnd> gcc-5.{0,1,2,3}.* and gcc-6.0 and require everyone to upgrade to
> > > Arnd> compilers that have not been released yet in order to build a
> > > Arnd> linux-4.6 kernel.
> > > 
> > > I agree that compiler blacklisting is problematic and I'd like to avoid
> > > it. The question is how far we go in the kernel to accommodate various
> > > levels of brokenness.
> > > 
> > > In any case. Sticking compiler workarounds in device driver code is akin
> > > to putting demolition orders on display on Alpha Centauri. At the very
> > > minimum the patch should put a fat comment in the code stating that
> > > these wrapper functions or #defines should not be changed in the future
> > > because that'll break builds using gcc XYZ. But that does not solve the
> > > problem for anybody else that might be doing something similar.
> > > Converting between u64 and $RANDOM_TYPE in an inline wrapper does not
> > > seem like a rare and unusual programming pattern.
> > 
> > It's not the driver really, it's the core scsi/fc layer, which makes
> > it a little dangerous that a random driver.
> > 
> > I agree that putting a comment in would also help. What I understand
> > from the bug report is that to trigger this bug you need these elements:
> > 
> > 1. an inline function marked __always_inline
> > 2. another inline function that is automatically inlined (not 
> > __always_inline)
> > 3. CONFIG_OPTIMIZE_INLINING=y to guarantee 2
> > 4. __builtin_compatible_p inside that inline function
> 
> The __always_inline requirement is not true.  In fact, if you look at
> the example testcase filed in
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70646#c7 you'll see it
> uses __builtin_compatible_p in an __always inline function that is
> called from one that is not tagged with that attribute.
>
> And generally speaking, always inline is never a requirement, any call
> or chain of calls that the inliner can decide to inline can lead to
> the bug (if it complies with the condition below).

Ok, thanks for the clarification, I thought you always had to have both
kinds of inline functions.
 
> What is a requirement, though, is that __builtin_compatible_p is
> called on something passed in an argument by reference or in an
> aggregate (i.e. struct or array) argument.
> 
> So,
> 
>   int foo1 (unsigned long *ref)
>   {
> if (__builtin_constant (*ref))
>   ...
> else
>   /* wrongly unreachable code */
>   }
> 
>   }
> 
> cannot, and is fine.  But please note that wrapping a foo[12]-like
> function into a dereferencing wrapper might not help if foo[12] would
> be early-inlined into such wrapper (GCC has two inliners, a very
> simple early-inliner that only handles simple cases and a full-blown
> IPA inliner that contains the bug).  I believe this can be ensured by
> making the wrapper always_inline and never calling it indirectly (via
> a pointer).  Honza (CCed), you know inlining heuristics better, please
> correct me if my last statement is somehow inaccurate (or indeed if
> you have a better idea how kernel developers can make sure they do not
> hit the bug).

I guess that means that any user of this code in the kernel:

static inline __attribute_const__ __u64 __fswab64(__u64 val)
{
#ifdef __HAVE_BUILTIN_BSWAP64__
return __builtin_bswap64(val);
#elif defined (__arch_swab64)
return __arch_swab64(val);
#elif defined(__SWAB_64_THRU_32__)
__u32 h = val >> 32;
__u32 l = val & ((1ULL << 32) - 1);
return (((__u64)__fswab32(l)) << 32) | ((__u64)(__fswab32(h)));
#else
return ___constant_swab64(val);
#endif
}

#define __swab64(x) \
(__builtin_constant_p((__u64)(x)) ? \
___constant_swab64(x) : \
__fswab64(x))

static __always_inline __u64 __swab64p(const __u64 *p)
{   
#ifdef __arch_swab64p
return __arch_swab64p(p);
#else
return __swab64(*p);
#endif
}

has a chance of running into the same problem, and we may want to solve
it at the root. For architectures that define __HAVE_BUILTIN_BSWAP64__
(i.e. ARM, MIPS, POWERPC, S390, and x86 with gcc-4.4 or higher, 4.8
for __HAVE_BUILTIN_BSWAP16__), we can probably just change the logic
to avoid __builtin_constant_p() and always use __builtin_bswap64().

This won't help on TILE, which is the one architecture that sets
ARCH_SUPPORTS_OPTIMIZED_INLINING but does not set ARCH_USE_BUILTIN_BSWAP.
Chris Metcalf should be able to figure out whether we can just
set ARCH_USE_BUILTIN_BSWAP for tile as well.

Arnd


Re: [PATCH v6 3/7] perf record: Split output into multiple files via '--switch-output'

2016-04-27 Thread Arnaldo Carvalho de Melo
Em Wed, Apr 20, 2016 at 06:59:50PM +0000, Wang Nan escreveu:
> Allow 'perf record' to split its output into multiple files.
> 
> For example:

I squashed:

->  360   T 04/20 Wang Nan(1.7K) ├─>[PATCH v6 6/7]
perf record: Re-synthesize tracking events after output switching

Into this patch, so that we don't have the problem in the bisection
history where samples don't get resolved to the existing threads not
synthesized in the perf.data.N where N > the first timestamp.

Please holler if you disagree, I doubt you will tho :-)

- Arnaldo

 
>   # ~/perf record -a --timestamp-filename --switch-output &
>   [1] 10763
>   # kill -s SIGUSR2 10763
>   [ perf record: dump data: Woken up 1 times ]
>   # [ perf record: Dump perf.data.2015122622314468 ]
> 
>   # kill -s SIGUSR2 10763
>   [ perf record: dump data: Woken up 1 times ]
>   # [ perf record: Dump perf.data.2015122622314762 ]
> 
>   # kill -s SIGUSR2 10763
>   [ perf record: dump data: Woken up 1 times ]
>   #[ perf record: Dump perf.data.2015122622315171 ]
> 
>   # fg
>   perf record -a --timestamp-filename --switch-output
>   ^C[ perf record: Woken up 1 times to write data ]
>   [ perf record: Dump perf.data.2015122622315513 ]
>   [ perf record: Captured and wrote 0.014 MB perf.data. (296 
> samples) ]
> 
>   # ls -l
>   total 920
>   -rw--- 1 root root 797692 Dec 26 22:31 perf.data.2015122622314468
>   -rw--- 1 root root  59960 Dec 26 22:31 perf.data.2015122622314762
>   -rw--- 1 root root  59912 Dec 26 22:31 perf.data.2015122622315171
>   -rw--- 1 root root  19220 Dec 26 22:31 perf.data.2015122622315513
> 
> Signed-off-by: Wang Nan 
> Tested-by: Arnaldo Carvalho de Melo 
> Cc: Adrian Hunter 
> Cc: Jiri Olsa 
> Cc: Masami Hiramatsu 
> Cc: Namhyung Kim 
> Cc: Zefan Li 
> Cc: pi3or...@163.com
> Link: 
> http://lkml.kernel.org/r/1460643725-167413-3-git-send-email-wangn...@huawei.com
> Signed-off-by: He Kuang 
> [ Added man page entry ]
> Signed-off-by: Arnaldo Carvalho de Melo 
> ---
>  tools/perf/Documentation/perf-record.txt |  8 
>  tools/perf/builtin-record.c  | 33 
> ++--
>  2 files changed, 39 insertions(+), 2 deletions(-)
> 
> diff --git a/tools/perf/Documentation/perf-record.txt 
> b/tools/perf/Documentation/perf-record.txt
> index 19aa175..a77a431 100644
> --- a/tools/perf/Documentation/perf-record.txt
> +++ b/tools/perf/Documentation/perf-record.txt
> @@ -347,6 +347,14 @@ Configure all used events to run in kernel space.
>  --all-user::
>  Configure all used events to run in user space.
>  
> +--switch-output::
> +Generate multiple perf.data files, timestamp prefixed, switching to a new one
> +when receiving a SIGUSR2.
> +
> +A possible use case is to, given an external event, slice the perf.data file
> +that gets then processed, possibly via a perf script, to decide if that
> +particular perf.data snapshot should be kept or not.
> +
>  SEE ALSO
>  
>  linkperf:perf-stat[1], linkperf:perf-list[1]
> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> index f4710c8..72246e2 100644
> --- a/tools/perf/builtin-record.c
> +++ b/tools/perf/builtin-record.c
> @@ -58,6 +58,7 @@ struct record {
>   bool no_buildid_cache_set;
>   bool buildid_all;
>   bool timestamp_filename;
> + bool switch_output;
>   unsigned long long  samples;
>  };
>  
> @@ -130,6 +131,7 @@ static volatile int child_finished;
>  
>  static volatile int auxtrace_record__snapshot_started;
>  static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
> +static DEFINE_TRIGGER(switch_output_trigger);
>  
>  static void sig_handler(int sig)
>  {
> @@ -650,9 +652,12 @@ static int __cmd_record(struct record *rec, int argc, 
> const char **argv)
>   signal(SIGINT, sig_handler);
>   signal(SIGTERM, sig_handler);
>  
> - if (rec->opts.auxtrace_snapshot_mode) {
> + if (rec->opts.auxtrace_snapshot_mode || rec->switch_output) {
>   signal(SIGUSR2, snapshot_sig_handler);
> - trigger_on(&auxtrace_snapshot_trigger);
> + if (rec->opts.auxtrace_snapshot_mode)
> + trigger_on(&auxtrace_snapshot_trigger);
> + if (rec->switch_output)
> + trigger_on(&switch_output_trigger);
>   } else {
>   signal(SIGUSR2, SIG_IGN);
>   }
> @@ -782,11 +787,13 @@ static int __cmd_record(struct record *rec, int argc, 
> const char **argv)
>   }
>  
>   trigger_ready(&auxtrace_snapshot_trigger);
> + trigger_ready(&switch_output_trigger);
>   for (;;) {
>   unsigned long long hits = rec->samples;
>  
>   if (record__mmap_read_all(rec) < 0) {
>   trigger_error(&auxtrace_snapshot_trigger);
> + 

Re: [PATCH V2] cpuidle: Change ktime_get() with local_clock()

2016-04-27 Thread Rafael J. Wysocki
On Friday, April 22, 2016 08:42:40 AM Peter Zijlstra wrote:
> On Thu, Apr 21, 2016 at 09:41:14PM +0200, Rafael J. Wysocki wrote:
> > On Thu, Apr 21, 2016 at 10:56 AM, Daniel Lezcano
> >  wrote:
> > > > The ktime_get() can have a non-negligible overhead, use local_clock()
> > > instead.
> > >
> > > > In order to test the difference between ktime_get() and local_clock(),
> > > > a quick hack has been added to trigger, via debugfs, 10000 times a
> > > > call to ktime_get() and local_clock() and measure the elapsed time.
> > > >
> > > > Then the average value, the min and max are computed for each call.
> > > >
> > > > From userspace, the test above was called 100 times every 2 seconds.
> > > >
> > > > So, ktime_get() and local_clock() have been called 1000000 times in
> > > > total.
> > >
> > > The results are:
> > >
> > > ktime_get():
> > > 
> > >  * average: 101 ns (stddev: 27.4)
> > >  * maximum: 38313 ns
> > >  * minimum: 65 ns
> > >
> > > local_clock():
> > > ==
> > >  * average: 60 ns (stddev: 9.8)
> > >  * maximum: 13487 ns
> > >  * minimum: 46 ns
> > >
> > > The local_clock() is faster and more stable.
> > >
> > > Even if it is a drop in the ocean, changing the ktime_get() by the
> > > local_clock() allows to save 80ns at idle time (entry + exit). And
> > > in some circumstances, especially when there are several CPUs racing
> > > for the clock access, we save tens of microseconds.
> > >
> > > The idle duration resulting from a diff is converted from nanosec to
> > > microsec. This could be done with integer division (div 1000) - which is
> > > an expensive operation or by 10 bits shifting (div 1024) - which is fast
> > > > but imprecise.
> > >
> > > The following table gives some results at the limits.
> > >
> > >  --
> > > |   nsec   |   div(1000)   |   div(1024)   |
> > >  --
> > > |   1e3|1 usec |  976 nsec |
> > >  --
> > > |   1e6| 1000 usec |  976 usec |
> > >  --
> > > > |   1e9|  1000000 usec |   976562 usec |
> > >  --
> > >
> > > There is a linear deviation of 2.34%. This loss of precision is acceptable
> > > in the context of the resulting diff which is used for statistics. These
> > > > ones are processed to estimate an approximation of the duration of 
> > > the
> > > next idle period which ends up into an idle state selection. The selection
> > > criteria takes into account the next duration based on large intervals,
> > > represented by the idle state's target residency.
> > >
> > > The 2^10 division is enough because the approximation regarding the 1e3
> > > division is lost in all the approximations done for the next idle duration
> > > computation.
> > >
> > > Signed-off-by: Daniel Lezcano 
> > 
> > Looks good to me.
> > 
> > Peter, are you happy with the changelog now?
> 
> Yep, works for me:
> 
> Acked-by: Peter Zijlstra (Intel) 

OK, applied.  Thanks!



Re: [PATCH v6 3/7] perf record: Split output into multiple files via '--switch-output'

2016-04-27 Thread Arnaldo Carvalho de Melo
Em Wed, Apr 20, 2016 at 06:59:50PM +0000, Wang Nan escreveu:
> Allow 'perf record' to split its output into multiple files.
> 
> For example:

I squashed:

->  360   T 04/20 Wang Nan(1.7K) ├─>[PATCH v6 6/7]
perf record: Re-synthesize tracking events after output switching

Into this patch, so that we don't have the problem in the bisection
history where samples don't get resolved to the existing threads not
synthesized in the perf.data.N where N > the first timestamp.

Please holler if you disagree, I doubt you will tho :-)

- Arnaldo

 
>   # ~/perf record -a --timestamp-filename --switch-output &
>   [1] 10763
>   # kill -s SIGUSR2 10763
>   [ perf record: dump data: Woken up 1 times ]
>   # [ perf record: Dump perf.data.2015122622314468 ]
> 
>   # kill -s SIGUSR2 10763
>   [ perf record: dump data: Woken up 1 times ]
>   # [ perf record: Dump perf.data.2015122622314762 ]
> 
>   # kill -s SIGUSR2 10763
>   [ perf record: dump data: Woken up 1 times ]
>   #[ perf record: Dump perf.data.2015122622315171 ]
> 
>   # fg
>   perf record -a --timestamp-filename --switch-output
>   ^C[ perf record: Woken up 1 times to write data ]
>   [ perf record: Dump perf.data.2015122622315513 ]
>   [ perf record: Captured and wrote 0.014 MB perf.data. (296 
> samples) ]
> 
>   # ls -l
>   total 920
>   -rw--- 1 root root 797692 Dec 26 22:31 perf.data.2015122622314468
>   -rw--- 1 root root  59960 Dec 26 22:31 perf.data.2015122622314762
>   -rw--- 1 root root  59912 Dec 26 22:31 perf.data.2015122622315171
>   -rw--- 1 root root  19220 Dec 26 22:31 perf.data.2015122622315513
> 
> Signed-off-by: Wang Nan 
> Tested-by: Arnaldo Carvalho de Melo 
> Cc: Adrian Hunter 
> Cc: Jiri Olsa 
> Cc: Masami Hiramatsu 
> Cc: Namhyung Kim 
> Cc: Zefan Li 
> Cc: pi3or...@163.com
> Link: 
> http://lkml.kernel.org/r/1460643725-167413-3-git-send-email-wangn...@huawei.com
> Signed-off-by: He Kuang 
> [ Added man page entry ]
> Signed-off-by: Arnaldo Carvalho de Melo 
> ---
>  tools/perf/Documentation/perf-record.txt |  8 
>  tools/perf/builtin-record.c  | 33 
> ++--
>  2 files changed, 39 insertions(+), 2 deletions(-)
> 
> diff --git a/tools/perf/Documentation/perf-record.txt 
> b/tools/perf/Documentation/perf-record.txt
> index 19aa175..a77a431 100644
> --- a/tools/perf/Documentation/perf-record.txt
> +++ b/tools/perf/Documentation/perf-record.txt
> @@ -347,6 +347,14 @@ Configure all used events to run in kernel space.
>  --all-user::
>  Configure all used events to run in user space.
>  
> +--switch-output::
> +Generate multiple perf.data files, timestamp prefixed, switching to a new one
> +when receiving a SIGUSR2.
> +
> +A possible use case is to, given an external event, slice the perf.data file
> +that gets then processed, possibly via a perf script, to decide if that
> +particular perf.data snapshot should be kept or not.
> +
>  SEE ALSO
>  
>  linkperf:perf-stat[1], linkperf:perf-list[1]
> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> index f4710c8..72246e2 100644
> --- a/tools/perf/builtin-record.c
> +++ b/tools/perf/builtin-record.c
> @@ -58,6 +58,7 @@ struct record {
>   bool no_buildid_cache_set;
>   bool buildid_all;
>   bool timestamp_filename;
> + bool switch_output;
>   unsigned long long  samples;
>  };
>  
> @@ -130,6 +131,7 @@ static volatile int child_finished;
>  
>  static volatile int auxtrace_record__snapshot_started;
>  static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
> +static DEFINE_TRIGGER(switch_output_trigger);
>  
>  static void sig_handler(int sig)
>  {
> @@ -650,9 +652,12 @@ static int __cmd_record(struct record *rec, int argc, 
> const char **argv)
>   signal(SIGINT, sig_handler);
>   signal(SIGTERM, sig_handler);
>  
> - if (rec->opts.auxtrace_snapshot_mode) {
> + if (rec->opts.auxtrace_snapshot_mode || rec->switch_output) {
>   signal(SIGUSR2, snapshot_sig_handler);
> - trigger_on(&auxtrace_snapshot_trigger);
> + if (rec->opts.auxtrace_snapshot_mode)
> + trigger_on(&auxtrace_snapshot_trigger);
> + if (rec->switch_output)
> + trigger_on(&switch_output_trigger);
>   } else {
>   signal(SIGUSR2, SIG_IGN);
>   }
> @@ -782,11 +787,13 @@ static int __cmd_record(struct record *rec, int argc, 
> const char **argv)
>   }
>  
>   trigger_ready(&auxtrace_snapshot_trigger);
> + trigger_ready(&switch_output_trigger);
>   for (;;) {
>   unsigned long long hits = rec->samples;
>  
>   if (record__mmap_read_all(rec) < 0) {
>   trigger_error(&auxtrace_snapshot_trigger);
> + trigger_error(&switch_output_trigger);
>   err = -1;
>   goto out_child;
>   }
> @@ -802,6 +809,22 @@ static int __cmd_record(struct record *rec, int argc, 

Re: [PATCH V2] cpuidle: Change ktime_get() with local_clock()

2016-04-27 Thread Rafael J. Wysocki
On Friday, April 22, 2016 08:42:40 AM Peter Zijlstra wrote:
> On Thu, Apr 21, 2016 at 09:41:14PM +0200, Rafael J. Wysocki wrote:
> > On Thu, Apr 21, 2016 at 10:56 AM, Daniel Lezcano
> >  wrote:
> > > > The ktime_get() can have a non-negligible overhead, use local_clock()
> > > instead.
> > >
> > > > In order to test the difference between ktime_get() and local_clock(),
> > > > a quick hack has been added to trigger, via debugfs, 10000 times a
> > > > call to ktime_get() and local_clock() and measure the elapsed time.
> > > >
> > > > Then the average value, the min and max are computed for each call.
> > > >
> > > > From userspace, the test above was called 100 times every 2 seconds.
> > > >
> > > > So, ktime_get() and local_clock() have been called 1000000 times in
> > > > total.
> > >
> > > The results are:
> > >
> > > ktime_get():
> > > 
> > >  * average: 101 ns (stddev: 27.4)
> > >  * maximum: 38313 ns
> > >  * minimum: 65 ns
> > >
> > > local_clock():
> > > ==
> > >  * average: 60 ns (stddev: 9.8)
> > >  * maximum: 13487 ns
> > >  * minimum: 46 ns
> > >
> > > The local_clock() is faster and more stable.
> > >
> > > Even if it is a drop in the ocean, changing the ktime_get() by the
> > > local_clock() allows to save 80ns at idle time (entry + exit). And
> > > in some circumstances, especially when there are several CPUs racing
> > > for the clock access, we save tens of microseconds.
> > >
> > > The idle duration resulting from a diff is converted from nanosec to
> > > microsec. This could be done with integer division (div 1000) - which is
> > > an expensive operation or by 10 bits shifting (div 1024) - which is fast
> > > > but imprecise.
> > >
> > > The following table gives some results at the limits.
> > >
> > >  --
> > > |   nsec   |   div(1000)   |   div(1024)   |
> > >  --
> > > |   1e3|1 usec |  976 nsec |
> > >  --
> > > |   1e6| 1000 usec |  976 usec |
> > >  --
> > > > |   1e9|  1000000 usec |   976562 usec |
> > >  --
> > >
> > > There is a linear deviation of 2.34%. This loss of precision is acceptable
> > > in the context of the resulting diff which is used for statistics. These
> > > > ones are processed to estimate an approximation of the duration of 
> > > the
> > > next idle period which ends up into an idle state selection. The selection
> > > criteria takes into account the next duration based on large intervals,
> > > represented by the idle state's target residency.
> > >
> > > The 2^10 division is enough because the approximation regarding the 1e3
> > > division is lost in all the approximations done for the next idle duration
> > > computation.
> > >
> > > Signed-off-by: Daniel Lezcano 
> > 
> > Looks good to me.
> > 
> > Peter, are you happy with the changelog now?
> 
> Yep, works for me:
> 
> Acked-by: Peter Zijlstra (Intel) 

OK, applied.  Thanks!



Re: [PATCH] PM / clk: ensure we don't allocate a -ve size of count clks

2016-04-27 Thread Rafael J. Wysocki
On Saturday, April 16, 2016 01:50:03 PM Colin King wrote:
> From: Colin Ian King 
> 
> It is entirely possible for of_count_phandle_with_args to
> return a -ve error return value so we need to check for this
> otherwise we end up allocating a negative number of clk objects.
> 
> Signed-off-by: Colin Ian King 

Applied, thanks!



Re: [PATCH] PM / clk: ensure we don't allocate a -ve size of count clks

2016-04-27 Thread Rafael J. Wysocki
On Saturday, April 16, 2016 01:50:03 PM Colin King wrote:
> From: Colin Ian King 
> 
> It is entirely possible for of_count_phandle_with_args to
> return a -ve error return value so we need to check for this
> otherwise we end up allocating a negative number of clk objects.
> 
> Signed-off-by: Colin Ian King 

Applied, thanks!



Re: [PATCH] powercap/intel_rapl: Add support for Kabylake

2016-04-27 Thread Rafael J. Wysocki
On Monday, April 25, 2016 08:36:47 AM Jacob Pan wrote:
> On Mon, 25 Apr 2016 16:20:17 +0200
> "Rafael J. Wysocki"  wrote:
> 
> > How urgent is this?
> > 
> > Can we live without it in 4.6 in particular?
> not urgent.

OK

Queued up for 4.7, thanks!



Re: [PATCH v4 4/4] ARM64: dts: rockchip: add dts file for RK3399 evaluation board

2016-04-27 Thread Heiko Stübner
Am Mittwoch, 27. April 2016, 15:54:53 schrieb Jianqun Xu:
> This patch adds rk3399-evb.dts for RK3399 evaluation board.
> Tested on RK3399 evb.
> 
> Signed-off-by: Jianqun Xu 

applied to my dts64-branch for 4.7

Thanks
Heiko


Re: [PATCH] powercap/intel_rapl: Add support for Kabylake

2016-04-27 Thread Rafael J. Wysocki
On Monday, April 25, 2016 08:36:47 AM Jacob Pan wrote:
> On Mon, 25 Apr 2016 16:20:17 +0200
> "Rafael J. Wysocki"  wrote:
> 
> > How urgent is this?
> > 
> > Can we live without it in 4.6 in particular?
> not urgent.

OK

Queued up for 4.7, thanks!



Re: [PATCH v4 4/4] ARM64: dts: rockchip: add dts file for RK3399 evaluation board

2016-04-27 Thread Heiko Stübner
Am Mittwoch, 27. April 2016, 15:54:53 schrieb Jianqun Xu:
> This patch adds rk3399-evb.dts for RK3399 evaluation board.
> Tested on RK3399 evb.
> 
> Signed-off-by: Jianqun Xu 

applied to my dts64-branch for 4.7

Thanks
Heiko


Re: [PATCH v2 3/3] ARM: dts: exynos: Lower SD card interface voltage to 2.8v on Odroid X/X2/U3

2016-04-27 Thread Javier Martinez Canillas
Hello 

On 04/27/2016 10:00 AM, Krzysztof Kozlowski wrote:
> Odroid X/X2/U3 schematics say that SD card vmmc regulator
> (LDO21/TFLASH) operates on 2.8 V. Mainline U-Boot uses that value as
> well. 2.8 V is common on Exynos-based boards. Additionally use some
> descriptive name for this regulator.
> 
> Signed-off-by: Krzysztof Kozlowski 
> 

Reviewed-by: Javier Martinez Canillas 

Best regards,
-- 
Javier Martinez Canillas
Open Source Group
Samsung Research America


Re: [PATCH v2 3/3] ARM: dts: exynos: Lower SD card interface voltage to 2.8v on Odroid X/X2/U3

2016-04-27 Thread Javier Martinez Canillas
Hello 

On 04/27/2016 10:00 AM, Krzysztof Kozlowski wrote:
> Odroid X/X2/U3 schematics say that SD card vmmc regulator
> (LDO21/TFLASH) operates on 2.8 V. Mainline U-Boot uses that value as
> well. 2.8 V is common on Exynos-based boards. Additionally use some
> descriptive name for this regulator.
> 
> Signed-off-by: Krzysztof Kozlowski 
> 

Reviewed-by: Javier Martinez Canillas 

Best regards,
-- 
Javier Martinez Canillas
Open Source Group
Samsung Research America


Re: [PATCH v4 3/4] Documentation: devicetree: rockchip: Document rk3399-evb

2016-04-27 Thread Heiko Stübner
Am Mittwoch, 27. April 2016, 15:54:52 schrieb Jianqun Xu:
> Use "rockchip,rk3399-evb" compatible string for Rockchip RK3399
> evaluation board.
> 
> Acked-by: Rob Herring 
> Signed-off-by: Jianqun Xu 

applied to my dts64-branch for 4.7

Thanks
Heiko


Re: [PATCH v4 3/4] Documentation: devicetree: rockchip: Document rk3399-evb

2016-04-27 Thread Heiko Stübner
Am Mittwoch, 27. April 2016, 15:54:52 schrieb Jianqun Xu:
> Use "rockchip,rk3399-evb" compatible string for Rockchip RK3399
> evaluation board.
> 
> Acked-by: Rob Herring 
> Signed-off-by: Jianqun Xu 

applied to my dts64-branch for 4.7

Thanks
Heiko


Re: [PATCH v4 1/4] Documentation: rockchip-dw-mshc: add description for rk3399

2016-04-27 Thread Heiko Stübner
Am Mittwoch, 27. April 2016, 15:54:50 schrieb Jianqun Xu:
> From: Shawn Lin 
> 
> Add "rockchip,rk3399-dw-mshc", "rockchip,rk3288-dw-mshc" for
> dwmmc on rk3399 platform.
> 
> Acked-by: Rob Herring 
> Signed-off-by: Shawn Lin 
> Signed-off-by: Jianqun Xu 

applied to my dts64-branch for 4.7

Thanks
Heiko


Re: [PATCH v4 2/4] ARM64: dts: rockchip: add core dtsi file for RK3399 SoCs

2016-04-27 Thread Heiko Stübner
Am Mittwoch, 27. April 2016, 15:54:51 schrieb Jianqun Xu:
> This patch adds core dtsi file for Rockchip RK3399 SoCs.
> 
> The RK3399 has big/little architecture, which needs a separate
> node for the PMU of each microarchitecture; for now it is missing
> the pmu node since the old one could not work well.
> 
> Tested-by: Brian Norris 
> Signed-off-by: Jianqun Xu 

with some minor reorganizations, applied to my dts64 branch for 4.7


Thanks
Heiko


Re: [PATCH v4 1/4] Documentation: rockchip-dw-mshc: add description for rk3399

2016-04-27 Thread Heiko Stübner
Am Mittwoch, 27. April 2016, 15:54:50 schrieb Jianqun Xu:
> From: Shawn Lin 
> 
> Add "rockchip,rk3399-dw-mshc", "rockchip,rk3288-dw-mshc" for
> dwmmc on rk3399 platform.
> 
> Acked-by: Rob Herring 
> Signed-off-by: Shawn Lin 
> Signed-off-by: Jianqun Xu 

applied to my dts64-branch for 4.7

Thanks
Heiko


Re: [PATCH v4 2/4] ARM64: dts: rockchip: add core dtsi file for RK3399 SoCs

2016-04-27 Thread Heiko Stübner
Am Mittwoch, 27. April 2016, 15:54:51 schrieb Jianqun Xu:
> This patch adds core dtsi file for Rockchip RK3399 SoCs.
> 
> The RK3399 has big/little architecture, which needs a separate
> node for the PMU of each microarchitecture; for now it is missing
> the pmu node since the old one could not work well.
> 
> Tested-by: Brian Norris 
> Signed-off-by: Jianqun Xu 

with some minor reorganizations, applied to my dts64 branch for 4.7


Thanks
Heiko


Re: [PATCH] ASoC: atmel_ssc_dai: note buggy I2S support when the codec masters LRCLK

2016-04-27 Thread Peter Rosin
On 2016-04-27 18:23, Mark Brown wrote:
> On Wed, Apr 27, 2016 at 11:06:33AM +0200, Peter Rosin wrote:
>
>> While the start condition is correct for the left channel word in the I2S
>> case, it is not correct that the right channel word follows immediately
>> after the left channel word. The start of the right channel word should
>> be triggered by a rising edge on LRCLK in the I2S case, something which
>> simply does not happen.
>
> Almost every programmable serial port does this, it's a very common
> issue which is why we always try to go for exact clocking where we can -
> it greatly improves the interoperability for I2S if there are no dead
> clocks.

Someone said: be conservative in what you send, be liberal in what you
accept. This is not that at all. This is more like: be conservative in
what you send, and accept only a subset of valid input.

It absolutely kills interoperability if you claim I2S and then don't
do I2S. If you are not looking at both edges of LRCLK it simply isn't I2S.
It's something else. Like "packed I2S" or "DSP mode A with inverted
symmetric LRCLK" or something, so why not invent a way to say that?

Dais could be taught that an I2S LRCLK slave is compatible with this
new mode as LRCLK master, but not the other way around. But since most
people are not making dai link decisions, that will probably be a lot
of work for questionable gain. What is needed is documentation about
quirks such as this, hence my patch.

I have this codec which does I2S but there is no way to get rid of dead
clocks when it masters the clocks. It can divide MCLK with 1,2,4,8 or 16
to get a BCLK, or it can generate BCLK as 48x or 64x LRCLK. But it only
sports 16 bits per sample.

So, if it divides MCLK, and MCLK is not matched to the needed LRCLK (there
is no need, we currently use a 16MHz MCLK) there will almost certainly
be some dead BCLKs, and if LRCLK is used as base for BCLK, there will be
either 8 or 16 dead cycles per channel. In other words, it is difficult
or next to impossible to not get dead cycles with this codec. But it
does I2S correctly.

When you don't know beforehand that the clock slave (Atmel SSC) is lying
and isn't supporting I2S even though it claims so, it turned out to be a
huge time sink for me when I tried to bring up this new codec (max9860).
I naturally thought that the problem was with my new code and not the
old (assumed mature and mostly free of silly bugs) code. Especially for
something as well understood as I2S. I was baffled when I realized what
the problem was, it wasn't even on my map. I'm probably naive...

Cheers,
Peter



Re: [PATCH] ASoC: atmel_ssc_dai: note buggy I2S support when the codec masters LRCLK

2016-04-27 Thread Peter Rosin
On 2016-04-27 18:23, Mark Brown wrote:
> On Wed, Apr 27, 2016 at 11:06:33AM +0200, Peter Rosin wrote:
>
>> While the start condition is correct for the left channel word in the I2S
>> case, it is not correct that the right channel word follows immediately
>> after the left channel word. The start of the right channel word should
>> be triggered by a rising edge on LRCLK in the I2S case, something which
>> simply does not happen.
>
> Almost every programmable serial port does this, it's a very common
> issue which is why we always try to go for exact clocking where we can -
> it greatly improves the interoperability for I2S if there are no dead
> clocks.

Someone said: be conservative in what you send, be liberal in what you
accept. This is not that at all. This is more like: be conservative in
what you send, and accept only a subset of valid input.

It absolutely kills interoperability if you claim I2S and then don't
do I2S. If you are not looking at both edges of LRCLK it simply isn't I2S.
It's something else. Like "packed I2S" or "DSP mode A with inverted
symmetric LRCLK" or something, so why not invent a way to say that?

Dais could be taught that an I2S LRCLK slave is compatible with this
new mode as LRCLK master, but not the other way around. But since most
people are not making dai link decisions, that will probably be a lot
of work for questionable gain. What is needed is documentation about
quirks such as this, hence my patch.

I have this codec which does I2S but there is no way to get rid of dead
clocks when it masters the clocks. It can divide MCLK with 1,2,4,8 or 16
to get a BCLK, or it can generate BCLK as 48x or 64x LRCLK. But it only
sports 16 bits per sample.

So, if it divides MCLK, and MCLK is not matched to the needed LRCLK (there
is no need, we currently use a 16MHz MCLK) there will almost certainly
be some dead BCLKs, and if LRCLK is used as base for BCLK, there will be
either 8 or 16 dead cycles per channel. In other words, it is difficult
or next to impossible to not get dead cycles with this codec. But it
does I2S correctly.

When you don't know beforehand that the clock slave (Atmel SSC) is lying
and isn't supporting I2S even though it claims so, it turned out to be a
huge time sink for me when I tried to bring up this new codec (max9860).
I naturally thought that the problem was with my new code and not the
old (assumed mature and mostly free of silly bugs) code. Especially for
something as well understood as I2S. I was baffled when I realized what
the problem was, it wasn't even on my map. I'm probably naive...

Cheers,
Peter



Re: [PATCHv2] musb_host: fix lockup on rxcsr_h_error

2016-04-27 Thread Bin Liu
Hi,

On Wed, Apr 27, 2016 at 02:13:56PM -0500, Bin Liu wrote:
> Hi,
> 
> On Wed, Apr 27, 2016 at 09:26:10PM +0300, Maxim Uvarov wrote:
> > 2016-04-27 18:46 GMT+03:00 Bin Liu :
> > > Hi,
> > >
> > > On Wed, Apr 27, 2016 at 09:51:58AM +0300, Max Uvarov wrote:
> > >> Fix soft lockup when resetting remote device attached
> > >> to usb host. Configuration:
> > >> pppd -> musb hub -> usb-serial -> gsm modem
> > >
> > > I have heard a few reports similar to this symptom, but never been able
> > > to reproduce it on my side.
> > >
> > 
> > Ok, I can reproduce it almost very easy.
> > 
> > >> When gsm modem resets, musb rolls in incoming rx interrupts
> > >> which does not give any time to other application as result
> > > >> it totally locks up. Solution is to keep original logic for RXCSR_H_ERROR
> > >
> > > Have you looked where exact place in the interrupt routine the execution
> > > has stuck in?
> > >
> > 
> > It does not get stuck. It goes to the line which prints the proto error over
> > and over again and
> > nothing stops that. After some time kernel reports lockup. But
> > actually it's not stuck,
> > all cpu time was eaten by executing that handlers.
> > 
> > 
> > >> and merge RXCSR_DATAERROR and RXCSR_H_ERROR branches to call same code
> > >> for setting rx stall with MUSB_RXCSR_H_WZC_BITS.
> > >
> > > MUSB_RXCSR_H_WZC_BITS itself does not set rx stall, it just ensures
> > > MUSB_RXCSR_H_RXSTALL not to be cleared. Please check its comment in
> > > musb_regs.h.
> > >
> > >>
> > >> Signed-off-by: Max Uvarov 
> > >> ---
> > >>  v2: use bitwise or for error flags before logical and. (Sergei 
> > >> Shtylyov).
> > >>
> > >>  drivers/usb/musb/musb_host.c | 12 +---
> > >>  1 file changed, 5 insertions(+), 7 deletions(-)
> > >>
> > >> diff --git a/drivers/usb/musb/musb_host.c b/drivers/usb/musb/musb_host.c
> > >> index c3d5fc9..2d9aa78 100644
> > >> --- a/drivers/usb/musb/musb_host.c
> > >> +++ b/drivers/usb/musb/musb_host.c
> > >> @@ -1592,14 +1592,12 @@ void musb_host_rx(struct musb *musb, u8 epnum)
> > >
> > > What kernel do you use? This line # is away off from upstream kernel.
> > >
> > 
> > I did this patch for 4.1 but 4.6 has the same problem and patch
> > cleanly applies to the latest torvalds/linux.git v4.6-rc5. This
> > interrupt handler has the same code.  And looks like on 3.14
> 
> Yeah, this code hasn't been changed for years. But in general, it is
> preferred to create patches on the latest kernel to avoid other headaches.
> 
> > everything worked. I don't have a time to diff 2 versions. Might be
> > regression.
> > 
> > 
> > >>
> > >>   /* stall; record URB status */
> > >>   status = -EPIPE;
> > >> + } else if (rx_csr & (MUSB_RXCSR_DATAERROR | MUSB_RXCSR_H_ERROR)) {
> > >>
> > >> - } else if (rx_csr & MUSB_RXCSR_H_ERROR) {
> > >> - dev_dbg(musb->controller, "end %d RX proto error\n", 
> > >> epnum);
> > >> -
> > >> - status = -EPROTO;
> > >> - musb_writeb(epio, MUSB_RXINTERVAL, 0);
> > >> -
> > >> - } else if (rx_csr & MUSB_RXCSR_DATAERROR) {
> > >> + if (rx_csr & MUSB_RXCSR_H_ERROR) {
> > >> + status = -EPROTO;
> > >> + musb_writeb(epio, MUSB_RXINTERVAL, 0);
> > >> + }
> > >
> > > Please help me to understand how this change fixes the issue. I see the
> > > most effect of the change here is directly 'goto finish' so that 'done'
> > > flag is not set, then musb_advance_schedule() is not called. Is this the
> > > case or I missed other important pieces?
> > >
> > 
> > Right that is the goal. On this rxcsr_h_error kernel reschedules
> > current interrupt.  And that continues forever. For example adding
> 
> The MUSB Programming Guide says CPU should clear this MUSB_RXCSR_H_ERROR
> bit, but the current driver doesn't. I am wondering if this causes the
> controller keeps generating the same interrupt. Can you please try the
> following change instead to see if the lockup goes away?
> 
> @@ -1870,6 +1870,9 @@ void musb_host_rx(struct musb *musb, u8 epnum)
> status = -EPROTO;
> musb_writeb(epio, MUSB_RXINTERVAL, 0);
>  
> +   rx_csr &= ~MUSB_RXCSR_H_ERROR;
> +   musb_writew(epio, MUSB_RXCSR, rx_csr);

+   goto finish;

Please also add the line above. I will spend more time to understand
what is happening...

First of all, I don't like the idea of merging the two branches, it
makes the code ugly.

Regards,
-Bin.

> +
> } else if (rx_csr & MUSB_RXCSR_DATAERROR) {
>  
> if (USB_ENDPOINT_XFER_ISOC != qh->type) {
> 
> Regards,
> -Bin.
> 
> > msleep() can give some time for other processes. I'm not an expert in
> > this chip but I think that right solution in that case is not try to
> > reschedule and quick and allow hub to make reset and once again init
> > all devices (in my case ppp/pppd also shutdowns and then I bring
> > everything up with script.). 

Re: [PATCHv2] musb_host: fix lockup on rxcsr_h_error

2016-04-27 Thread Bin Liu
Hi,

On Wed, Apr 27, 2016 at 02:13:56PM -0500, Bin Liu wrote:
> Hi,
> 
> On Wed, Apr 27, 2016 at 09:26:10PM +0300, Maxim Uvarov wrote:
> > 2016-04-27 18:46 GMT+03:00 Bin Liu :
> > > Hi,
> > >
> > > On Wed, Apr 27, 2016 at 09:51:58AM +0300, Max Uvarov wrote:
> > >> Fix soft lockup when resetting remote device attached
> > >> to usb host. Configuration:
> > >> pppd -> musb hub -> usb-serial -> gsm modem
> > >
> > > I have heard a few reports similar to this symptom, but never been able
> > > to reproduce it on my side.
> > >
> > 
> > Ok, I can reproduce it very easily.
> > 
> > >> When the gsm modem resets, musb is flooded with incoming rx interrupts,
> > >> which does not leave any time for other applications; as a result
> > >> the system locks up totally. Solution is to keep original logic for RXCSR_H_ERROR
> > >
> > > Have you looked at the exact place in the interrupt routine where the
> > > execution gets stuck?
> > >
> > 
> > It does not get stuck. It goes to the line which prints the proto error over
> > and over again and
> > nothing stops that. After some time the kernel reports a lockup. But
> > actually it's not stuck,
> > all cpu time is eaten by executing those handlers.
> > 
> > 
> > >> and merge RXCSR_DATAERROR and RXCSR_H_ERROR branches to call same code
> > >> for setting rx stall with MUSB_RXCSR_H_WZC_BITS.
> > >
> > > MUSB_RXCSR_H_WZC_BITS itself does not set rx stall, it just ensures
> > > MUSB_RXCSR_H_RXSTALL not to be cleared. Please check its comment in
> > > musb_regs.h.
> > >
> > >>
> > >> Signed-off-by: Max Uvarov 
> > >> ---
> > >>  v2: use bitwise or for error flags before logical and. (Sergei 
> > >> Shtylyov).
> > >>
> > >>  drivers/usb/musb/musb_host.c | 12 +---
> > >>  1 file changed, 5 insertions(+), 7 deletions(-)
> > >>
> > >> diff --git a/drivers/usb/musb/musb_host.c b/drivers/usb/musb/musb_host.c
> > >> index c3d5fc9..2d9aa78 100644
> > >> --- a/drivers/usb/musb/musb_host.c
> > >> +++ b/drivers/usb/musb/musb_host.c
> > >> @@ -1592,14 +1592,12 @@ void musb_host_rx(struct musb *musb, u8 epnum)
> > >
> > > What kernel do you use? This line # is away off from upstream kernel.
> > >
> > 
> > I did this patch for 4.1 but 4.6 has the same problem and patch
> > cleanly applies to the latest torvalds/linux.git v4.6-rc5. This
> > interrupt handler has the same code.  And looks like on 3.14
> 
> Yeah, this code hasn't been changed for years. But in general, it is
> preferred to create patches on the latest kernel to avoid other headaches.
> 
> > everything worked. I don't have time to diff the 2 versions. Might be
> > a regression.
> > 
> > 
> > >>
> > >>   /* stall; record URB status */
> > >>   status = -EPIPE;
> > >> + } else if (rx_csr & (MUSB_RXCSR_DATAERROR | MUSB_RXCSR_H_ERROR)) {
> > >>
> > >> - } else if (rx_csr & MUSB_RXCSR_H_ERROR) {
> > >> - dev_dbg(musb->controller, "end %d RX proto error\n", 
> > >> epnum);
> > >> -
> > >> - status = -EPROTO;
> > >> - musb_writeb(epio, MUSB_RXINTERVAL, 0);
> > >> -
> > >> - } else if (rx_csr & MUSB_RXCSR_DATAERROR) {
> > >> + if (rx_csr & MUSB_RXCSR_H_ERROR) {
> > >> + status = -EPROTO;
> > >> + musb_writeb(epio, MUSB_RXINTERVAL, 0);
> > >> + }
> > >
> > > Please help me to understand how this change fixes the issue. I see the
> > > most effect of the change here is directly 'goto finish' so that 'done'
> > > flag is not set, then musb_advance_schedule() is not called. Is this the
> > > case or I missed other important pieces?
> > >
> > 
> > Right that is the goal. On this rxcsr_h_error kernel reschedules
> > current interrupt.  And that continues forever. For example adding
> 
> The MUSB Programming Guide says the CPU should clear this MUSB_RXCSR_H_ERROR
> bit, but the current driver doesn't. I am wondering if this causes the
> controller to keep generating the same interrupt. Can you please try the
> following change instead to see if the lockup goes away?
> 
> @@ -1870,6 +1870,9 @@ void musb_host_rx(struct musb *musb, u8 epnum)
> status = -EPROTO;
> musb_writeb(epio, MUSB_RXINTERVAL, 0);
>  
> +   rx_csr &= ~MUSB_RXCSR_H_ERROR;
> +   musb_writew(epio, MUSB_RXCSR, rx_csr);

+   goto finish;

Please also add the line above. I will spend more time to understand
what is happening...

First of all, I don't like the idea of merging the two branches, it
makes the code ugly.

Regards,
-Bin.

> +
> } else if (rx_csr & MUSB_RXCSR_DATAERROR) {
>  
> if (USB_ENDPOINT_XFER_ISOC != qh->type) {
> 
> Regards,
> -Bin.
> 
> > msleep() can give some time to other processes. I'm not an expert in
> > this chip but I think that the right solution in that case is not to try to
> > reschedule, but to quit and allow the hub to reset and once again init
> > all devices (in my case ppp/pppd also shuts down and then I bring
> > everything up with script.). The same behavior with dma and pio 

<    1   2   3   4   5   6   7   8   9   10   >