[Cluster-devel] [GFS2 PATCH 4/6] gfs2: introduce new gfs2_glock_assert_withdraw

2020-06-05 Thread Bob Peterson
Before this patch, asserts based on glocks did not print the glock with
the error. This patch introduces a new macro, gfs2_glock_assert_withdraw
which first prints the glock, then takes the assert.

This also changes a few glock asserts to the new macro.

Signed-off-by: Bob Peterson 
---
 fs/gfs2/glock.c | 7 ---
 fs/gfs2/glock.h | 9 +
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 9a5dadc93cfc..64541d8bf9ad 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -164,7 +164,7 @@ void gfs2_glock_free(struct gfs2_glock *gl)
 {
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
 
-   BUG_ON(atomic_read(>gl_revokes));
+   gfs2_glock_assert_withdraw(gl, atomic_read(>gl_revokes) == 0);
rhashtable_remove_fast(_hash_table, >gl_node, ht_parms);
smp_mb();
wake_up_glock(gl);
@@ -626,7 +626,8 @@ __acquires(>gl_lockref.lock)
 */
if ((atomic_read(>gl_ail_count) != 0) &&
(!cmpxchg(>sd_log_error, 0, -EIO))) {
-   gfs2_assert_warn(sdp, !atomic_read(>gl_ail_count));
+   gfs2_glock_assert_warn(gl,
+  !atomic_read(>gl_ail_count));
gfs2_dump_glock(NULL, gl, true);
}
glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : 
DIO_METADATA);
@@ -1836,7 +1837,7 @@ void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
int ret;
 
ret = gfs2_truncatei_resume(ip);
-   gfs2_assert_withdraw(gl->gl_name.ln_sbd, ret == 0);
+   gfs2_glock_assert_withdraw(gl, ret == 0);
 
spin_lock(>gl_lockref.lock);
clear_bit(GLF_LOCK, >gl_flags);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index b8adaf80e4c5..1c547dff7c57 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -205,6 +205,15 @@ extern void gfs2_dump_glock(struct seq_file *seq, struct 
gfs2_glock *gl,
 #define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { \
gfs2_dump_glock(NULL, gl, true);\
BUG(); } } while(0)
+#define gfs2_glock_assert_warn(gl, x) do { if (unlikely(!(x))) {   \
+   gfs2_dump_glock(NULL, gl, true);\
+   gfs2_assert_warn((gl)->gl_name.ln_sbd, (x)); } } \
+   while (0)
+#define gfs2_glock_assert_withdraw(gl, x) do { if (unlikely(!(x))) {   \
+   gfs2_dump_glock(NULL, gl, true);\
+   gfs2_assert_withdraw((gl)->gl_name.ln_sbd, (x)); } } \
+   while (0)
+
 extern __printf(2, 3)
 void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
 
-- 
2.26.2



[Cluster-devel] [GFS2 PATCH 1/6] gfs2: Add new sysfs file for gfs2 status

2020-06-05 Thread Bob Peterson
This patch adds a new file: /sys/fs/gfs2/*/status which will report
the status of the file system. Catting this file dumps the current
status of the file system according to various superblock variables.
For example:

Journal Checked:   1
Journal Live:  1
Withdrawn: 0
No barriers:   0
No recovery:   0
Demote:0
No Journal ID: 1
RO Recovery:   0
Skip DLM Unlock:   0
Force AIL Flush:   0
FS Frozen: 0
Withdrawing:   0
Withdraw In Prog:  0
Remote Withdraw:   0
Withdraw Recovery: 0
sd_log_lock held:  0
statfs_spin held:  0
sd_rindex_spin:0
sd_jindex_spin:0
sd_trunc_lock: 0
sd_bitmap_lock:0
sd_ordered_lock:   0
sd_ail_lock:   0
sd_log_error:  0
sd_log_flush_lock: 0
sd_log_flush_lock: 1
sd_log_in_flight:  1

Signed-off-by: Bob Peterson 
---
 fs/gfs2/sys.c | 65 +++
 1 file changed, 65 insertions(+)

diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index d28c41bd69b0..572353f0b41f 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -63,6 +63,69 @@ static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
MAJOR(sdp->sd_vfs->s_dev), MINOR(sdp->sd_vfs->s_dev));
 }
 
+static ssize_t status_show(struct gfs2_sbd *sdp, char *buf)
+{
+   unsigned long f = sdp->sd_flags;
+   ssize_t s;
+
+   s = snprintf(buf, PAGE_SIZE,
+"Journal Checked:   %d\n"
+"Journal Live:  %d\n"
+"Withdrawn: %d\n"
+"No barriers:   %d\n"
+"No recovery:   %d\n"
+"Demote:%d\n"
+"No Journal ID: %d\n"
+"RO Recovery:   %d\n"
+"Skip DLM Unlock:   %d\n"
+"Force AIL Flush:   %d\n"
+"FS Frozen: %d\n"
+"Withdrawing:   %d\n"
+"Withdraw In Prog:  %d\n"
+"Remote Withdraw:   %d\n"
+"Withdraw Recovery: %d\n"
+"sd_log_lock held:  %d\n"
+"statfs_spin held:  %d\n"
+"sd_rindex_spin:%d\n"
+"sd_jindex_spin:%d\n"
+"sd_trunc_lock: %d\n"
+"sd_bitmap_lock:%d\n"
+"sd_ordered_lock:   %d\n"
+"sd_ail_lock:   %d\n"
+"sd_log_error:  %d\n"
+"sd_log_flush_lock: %d\n"
+"sd_log_num_revoke: %u\n"
+"sd_log_in_flight:  %d\n",
+test_bit(SDF_JOURNAL_CHECKED, ),
+test_bit(SDF_JOURNAL_LIVE, ),
+test_bit(SDF_WITHDRAWN, ),
+test_bit(SDF_NOBARRIERS, ),
+test_bit(SDF_NORECOVERY, ),
+test_bit(SDF_DEMOTE, ),
+test_bit(SDF_NOJOURNALID, ),
+test_bit(SDF_RORECOVERY, ),
+test_bit(SDF_SKIP_DLM_UNLOCK, ),
+test_bit(SDF_FORCE_AIL_FLUSH, ),
+test_bit(SDF_FS_FROZEN, ),
+test_bit(SDF_WITHDRAWING, ),
+test_bit(SDF_WITHDRAW_IN_PROG, ),
+test_bit(SDF_REMOTE_WITHDRAW, ),
+test_bit(SDF_WITHDRAW_RECOVERY, ),
+spin_is_locked(>sd_log_lock),
+spin_is_locked(>sd_statfs_spin),
+spin_is_locked(>sd_rindex_spin),
+spin_is_locked(>sd_jindex_spin),
+spin_is_locked(>sd_trunc_lock),
+spin_is_locked(>sd_bitmap_lock),
+spin_is_locked(>sd_ordered_lock),
+spin_is_locked(>sd_ail_lock),
+sdp->sd_log_error,
+rwsem_is_locked(>sd_log_flush_lock),
+sdp->sd_log_num_revoke,
+atomic_read(>sd_log_in_flight));
+   return s;
+}
+
 static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf)
 {
return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_fsname);
@@ -283,6 +346,7 @@ GFS2_ATTR(quota_sync,  0200, NULL,  
quota_sync_store);
 GFS2_ATTR(quota_refresh_user,  0200, NULL,  quota_refresh_user_store);
 GFS2_ATTR(quota_refresh_group, 0200, NULL,  quota_refresh_group_store);
 GFS2_ATTR(demote_rq,   0200, NULL, demote_rq_store);
+GFS2_ATTR(status,  0444, status_show,   NULL);
 
 static struct attribute *gfs2_attrs[] = {
_attr_id.attr,
@@ -295,6 +359,7 @@ static struct attribute *gfs2_attrs[] = {
_attr_quota_refresh_user.attr,
_attr_quota_refresh_group.attr,
_attr_demote_rq.attr,
+   _attr_status.attr,
NULL,
 };
 ATTRIBUTE_GROUPS(gfs2);
-- 
2.26.2



[Cluster-devel] [GFS2 PATCH 6/6] gfs2: fix use-after-free on transaction ail lists

2020-06-05 Thread Bob Peterson
Before this patch, transactions could be merged into the system
transaction by function gfs2_merge_trans(), but then it is not
considered attached. The caller, log_refund, only sets TR_ATTACHED
if the transaction is not merged. Later, in function, gfs2_trans_end,
if TR_ATTACHED is not set (because it was merged) the transaction is
freed by gfs2_trans_free(). However, by the time this happens, the
log flush mechanism may have queued bd elements to the ail list,
which means tr_ail1_list may not be empty, and the flush may forget
about the bd elements altogether (memory leak, because it focuses
on the sdp transaction instead) or worse, reference them after the
transaction has been freed.

Although I've not seen any serious consequences, the problem becomes
apparent with the previous patch's addition of:

gfs2_assert_warn(sdp, list_empty(>tr_ail1_list));

to function gfs2_trans_free().

This patch adds logic into gfs2_merge_trans() to move the merged
transaction's ail lists to the sdp transaction. This prevents the
use-after-free.

To facilitate this, we pass sdp into the function instead of the
transaction itself.

Signed-off-by: Bob Peterson 
---
 fs/gfs2/log.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 64efa3f743af..5a2bc70b0859 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -1020,8 +1020,10 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct 
gfs2_glock *gl, u32 flags)
  * @new: New transaction to be merged
  */
 
-static void gfs2_merge_trans(struct gfs2_trans *old, struct gfs2_trans *new)
+static void gfs2_merge_trans(struct gfs2_sbd *sdp, struct gfs2_trans *new)
 {
+   struct gfs2_trans *old = sdp->sd_log_tr;
+
WARN_ON_ONCE(!test_bit(TR_ATTACHED, >tr_flags));
 
old->tr_num_buf_new += new->tr_num_buf_new;
@@ -1033,6 +1035,11 @@ static void gfs2_merge_trans(struct gfs2_trans *old, 
struct gfs2_trans *new)
 
list_splice_tail_init(>tr_databuf, >tr_databuf);
list_splice_tail_init(>tr_buf, >tr_buf);
+
+   spin_lock(>sd_ail_lock);
+   list_splice_tail_init(>tr_ail1_list, >tr_ail1_list);
+   list_splice_tail_init(>tr_ail2_list, >tr_ail2_list);
+   spin_unlock(>sd_ail_lock);
 }
 
 static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
@@ -1044,7 +1051,7 @@ static void log_refund(struct gfs2_sbd *sdp, struct 
gfs2_trans *tr)
gfs2_log_lock(sdp);
 
if (sdp->sd_log_tr) {
-   gfs2_merge_trans(sdp->sd_log_tr, tr);
+   gfs2_merge_trans(sdp, tr);
} else if (tr->tr_num_buf_new || tr->tr_num_databuf_new) {
gfs2_assert_withdraw(sdp, test_bit(TR_ALLOCED, >tr_flags));
sdp->sd_log_tr = tr;
-- 
2.26.2



Re: [Cluster-devel] [PATCH 6/8] gfs2: instrumentation wrt log_flush stuck

2020-06-05 Thread Bob Peterson
Hi Andreas,

- Original Message -
(snip)
> > @@ -970,7 +969,16 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct
> > gfs2_glock *gl, u32 flags)
> >
> > if (!(flags & GFS2_LOG_HEAD_FLUSH_NORMAL)) {
> > if (!sdp->sd_log_idle) {
> > +   unsigned long start = jiffies;
> > +
> > for (;;) {
> > +   if (time_after(jiffies, start + (HZ *
> > 600))) {
> 
> This should probably have some rate limiting as well, for example:

Seems unnecessary. If the log flush gets stuck, the message will be printed
once, and at most every 10 minutes.
 
> start = jiffies;
> 
> I'm not sure what provides similar rate limiting in gfs2_ail1_flush.
> 
> > +   fs_err(sdp, "Error: In
> > gfs2_log_flush "
> > +  "for 10 minutes! t=%d\n",
> > +  current->journal_info ? 1 :
> > 0);
> 
> Please don't break the format string up like that; this makes grepping
> harder.

How do you propose I break present it without going over 80 character?
I could #define it as a constant, or put it into a separate function
that has less indentation, I suppose.
 
Bob



Re: [Cluster-devel] [PATCH 6/8] gfs2: instrumentation wrt log_flush stuck

2020-06-05 Thread Andreas Gruenbacher
On Fri, Jun 5, 2020 at 4:49 PM Bob Peterson  wrote:
> Hi Andreas,
>
> - Original Message -
> (snip)
> > > @@ -970,7 +969,16 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct
> > > gfs2_glock *gl, u32 flags)
> > >
> > > if (!(flags & GFS2_LOG_HEAD_FLUSH_NORMAL)) {
> > > if (!sdp->sd_log_idle) {
> > > +   unsigned long start = jiffies;
> > > +
> > > for (;;) {
> > > +   if (time_after(jiffies, start + (HZ *
> > > 600))) {
> >
> > This should probably have some rate limiting as well, for example:
>
> Seems unnecessary. If the log flush gets stuck, the message will be printed
> once, and at most every 10 minutes.

No, after ten minutes, the message will actually be printed for each
iteration of the loop. That's exactly why I was suggesting the rate
limiting.

> > start = jiffies;
> >
> > I'm not sure what provides similar rate limiting in gfs2_ail1_flush.
> >
> > > +   fs_err(sdp, "Error: In
> > > gfs2_log_flush "
> > > +  "for 10 minutes! t=%d\n",
> > > +  current->journal_info ? 1 :
> > > 0);
> >
> > Please don't break the format string up like that; this makes grepping
> > harder.
>
> How do you propose I break present it without going over 80 character?
> I could #define it as a constant, or put it into a separate function
> that has less indentation, I suppose.

There *is* no hard 80 character limit. The checkpatch warnings
shouldn't push us into making our code worse. Also note that since
commit bdc48fa11e, checkpatch will only warn about lines longer than
100 characters.

Thanks,
Andreas



[Cluster-devel] [GFS2 PATCH 5/6] gfs2: new slab for transactions

2020-06-05 Thread Bob Peterson
This patch adds a new slab for gfs2 transactions. That allows us to
have an initialization function and protect against some errors.

Signed-off-by: Bob Peterson 
---
 fs/gfs2/glops.c |  2 ++
 fs/gfs2/log.c   |  9 +
 fs/gfs2/main.c  |  9 +
 fs/gfs2/trans.c | 22 ++
 fs/gfs2/trans.h |  1 +
 fs/gfs2/util.c  |  1 +
 fs/gfs2/util.h  |  1 +
 7 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 4862dae868a2..224fb3bd503c 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -91,6 +91,8 @@ static int gfs2_ail_empty_gl(struct gfs2_glock *gl)
memset(, 0, sizeof(tr));
INIT_LIST_HEAD(_buf);
INIT_LIST_HEAD(_databuf);
+   INIT_LIST_HEAD(_ail1_list);
+   INIT_LIST_HEAD(_ail2_list);
tr.tr_revokes = atomic_read(>gl_ail_count);
 
if (!tr.tr_revokes) {
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index fcc7f58d74f0..64efa3f743af 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -30,6 +30,7 @@
 #include "util.h"
 #include "dir.h"
 #include "trace_gfs2.h"
+#include "trans.h"
 
 static void gfs2_log_shutdown(struct gfs2_sbd *sdp);
 
@@ -378,7 +379,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int 
new_tail)
list_del(>tr_list);
gfs2_assert_warn(sdp, list_empty(>tr_ail1_list));
gfs2_assert_warn(sdp, list_empty(>tr_ail2_list));
-   kfree(tr);
+   gfs2_trans_free(sdp, tr);
}
 
spin_unlock(>sd_ail_lock);
@@ -863,14 +864,14 @@ static void ail_drain(struct gfs2_sbd *sdp)
gfs2_ail_empty_tr(sdp, tr, >tr_ail1_list);
gfs2_ail_empty_tr(sdp, tr, >tr_ail2_list);
list_del(>tr_list);
-   kfree(tr);
+   gfs2_trans_free(sdp, tr);
}
while (!list_empty(>sd_ail2_list)) {
tr = list_first_entry(>sd_ail2_list, struct gfs2_trans,
  tr_list);
gfs2_ail_empty_tr(sdp, tr, >tr_ail2_list);
list_del(>tr_list);
-   kfree(tr);
+   gfs2_trans_free(sdp, tr);
}
spin_unlock(>sd_ail_lock);
 }
@@ -1010,7 +1011,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct 
gfs2_glock *gl, u32 flags)
trace_gfs2_log_flush(sdp, 0, flags);
up_write(>sd_log_flush_lock);
 
-   kfree(tr);
+   gfs2_trans_free(sdp, tr);
 }
 
 /**
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index a1a295b739fb..733470ca6be9 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -143,6 +143,12 @@ static int __init init_gfs2_fs(void)
if (!gfs2_qadata_cachep)
goto fail_cachep7;
 
+   gfs2_trans_cachep = kmem_cache_create("gfs2_trans",
+  sizeof(struct gfs2_trans),
+  0, 0, NULL);
+   if (!gfs2_trans_cachep)
+   goto fail_cachep8;
+
error = register_shrinker(_qd_shrinker);
if (error)
goto fail_shrinker;
@@ -194,6 +200,8 @@ static int __init init_gfs2_fs(void)
 fail_fs1:
unregister_shrinker(_qd_shrinker);
 fail_shrinker:
+   kmem_cache_destroy(gfs2_trans_cachep);
+fail_cachep8:
kmem_cache_destroy(gfs2_qadata_cachep);
 fail_cachep7:
kmem_cache_destroy(gfs2_quotad_cachep);
@@ -236,6 +244,7 @@ static void __exit exit_gfs2_fs(void)
rcu_barrier();
 
mempool_destroy(gfs2_page_pool);
+   kmem_cache_destroy(gfs2_trans_cachep);
kmem_cache_destroy(gfs2_qadata_cachep);
kmem_cache_destroy(gfs2_quotad_cachep);
kmem_cache_destroy(gfs2_rgrpd_cachep);
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index ffe840505082..aa7cd0abb369 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -37,7 +37,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int 
blocks,
if (!test_bit(SDF_JOURNAL_LIVE, >sd_flags))
return -EROFS;
 
-   tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS);
+   tr = kmem_cache_zalloc(gfs2_trans_cachep, GFP_NOFS);
if (!tr)
return -ENOMEM;
 
@@ -53,6 +53,9 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int 
blocks,
INIT_LIST_HEAD(>tr_databuf);
INIT_LIST_HEAD(>tr_buf);
 
+   INIT_LIST_HEAD(>tr_ail1_list);
+   INIT_LIST_HEAD(>tr_ail2_list);
+
sb_start_intwrite(sdp->sd_vfs);
 
error = gfs2_log_reserve(sdp, tr->tr_reserved);
@@ -65,7 +68,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int 
blocks,
 
 fail:
sb_end_intwrite(sdp->sd_vfs);
-   kfree(tr);
+   kmem_cache_free(gfs2_trans_cachep, tr);
 
return error;
 }
@@ -93,7 +96,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
if (!test_bit(TR_TOUCHED, >tr_flags)) {
gfs2_log_release(sdp, tr->tr_reserved);
if (alloced) {
-   kfree(tr);
+   gfs2_trans_free(sdp, tr);

[Cluster-devel] [GFS2 PATCH 0/6 v2] Misc Patch Collection

2020-06-05 Thread Bob Peterson
I've revised my recent patch set based on feedback from Andreas.
Here is my latest. Changes from last time are:

1. Removed the spin_trylock related info from the status sysfs file
   as per Andreas's feedback. Although valuable to have, I see his point.
2. Removed "gfs2: Add new trace point for glock ail management".
   I still think this would be useful to have, but perhaps only compiled
   in with a new debug kernel option. For now, leave it out.
3. Removed "gfs2: add memory barriers to gfs2_glock_remove_revoke".
   It is not needed.
4. Removed "gfs2: Only do glock put in gfs2_create_inode for free inodes".
   It was pushed to the repo separately.
5. Revised "gfs2: instrumentation wrt log_flush stuck" moving that code
   to a new function, which makes the err string contiguous for grep.
6. Added "gfs2: new slab for transactions" which is revised to fix bugs
   that were in the old version.
7. Added "gfs2: fix use-after-free on transaction ail lists" that fixes
   a problem that comes to light with the previous patch.
8. Patches 1/8 and 2/8 are omitted, since they were pushed separately.

Bob Peterson (6):
  gfs2: Add new sysfs file for gfs2 status
  gfs2: print mapping->nrpages in glock dump for address space glocks
  gfs2: instrumentation wrt log_flush stuck
  gfs2: introduce new gfs2_glock_assert_withdraw
  gfs2: new slab for transactions
  gfs2: fix use-after-free on transaction ail lists

 fs/gfs2/glock.c | 32 +++-
 fs/gfs2/glock.h |  9 +++
 fs/gfs2/glops.c |  2 ++
 fs/gfs2/log.c   | 54 
 fs/gfs2/main.c  |  9 +++
 fs/gfs2/sys.c   | 65 +
 fs/gfs2/trans.c | 22 ++---
 fs/gfs2/trans.h |  1 +
 fs/gfs2/util.c  |  1 +
 fs/gfs2/util.h  |  1 +
 10 files changed, 165 insertions(+), 31 deletions(-)

-- 
2.26.2



Re: [Cluster-devel] [PATCH 7/8] gfs2: Add new trace point for glock ail management

2020-06-05 Thread Andreas Gruenbacher
Bob,

this very much looks like a debugging trace point that has no business
in production code.

Thanks,
Andreas



[Cluster-devel] [GFS2 PATCH v3] gfs2: new slab for transactions

2020-06-05 Thread Bob Peterson
Hi,

I revised the patch description for Andreas:

This patch adds a new slab for gfs2 transactions. That allows us to
reduce kernel memory fragmentation, have better organization of data
for analysis of vmcore dumps. A new centralized function is added to
free the slab objects, and it exposes use-after-free by giving
warnings if a transaction is freed while it still has bd elements
attached to its buffers or ail lists. We make sure to initialize
those transaction ail lists so we can check their integrity when freeing.

At a later time, we should add a slab initialization function to
make it more efficient, but for this initial patch I wanted to
minimize the impact.

Signed-off-by: Bob Peterson 
---
 fs/gfs2/glops.c |  2 ++
 fs/gfs2/log.c   |  9 +
 fs/gfs2/main.c  |  9 +
 fs/gfs2/trans.c | 22 ++
 fs/gfs2/trans.h |  1 +
 fs/gfs2/util.c  |  1 +
 fs/gfs2/util.h  |  1 +
 7 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 4862dae868a2..224fb3bd503c 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -91,6 +91,8 @@ static int gfs2_ail_empty_gl(struct gfs2_glock *gl)
memset(, 0, sizeof(tr));
INIT_LIST_HEAD(_buf);
INIT_LIST_HEAD(_databuf);
+   INIT_LIST_HEAD(_ail1_list);
+   INIT_LIST_HEAD(_ail2_list);
tr.tr_revokes = atomic_read(>gl_ail_count);
 
if (!tr.tr_revokes) {
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index fcc7f58d74f0..64efa3f743af 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -30,6 +30,7 @@
 #include "util.h"
 #include "dir.h"
 #include "trace_gfs2.h"
+#include "trans.h"
 
 static void gfs2_log_shutdown(struct gfs2_sbd *sdp);
 
@@ -378,7 +379,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int 
new_tail)
list_del(>tr_list);
gfs2_assert_warn(sdp, list_empty(>tr_ail1_list));
gfs2_assert_warn(sdp, list_empty(>tr_ail2_list));
-   kfree(tr);
+   gfs2_trans_free(sdp, tr);
}
 
spin_unlock(>sd_ail_lock);
@@ -863,14 +864,14 @@ static void ail_drain(struct gfs2_sbd *sdp)
gfs2_ail_empty_tr(sdp, tr, >tr_ail1_list);
gfs2_ail_empty_tr(sdp, tr, >tr_ail2_list);
list_del(>tr_list);
-   kfree(tr);
+   gfs2_trans_free(sdp, tr);
}
while (!list_empty(>sd_ail2_list)) {
tr = list_first_entry(>sd_ail2_list, struct gfs2_trans,
  tr_list);
gfs2_ail_empty_tr(sdp, tr, >tr_ail2_list);
list_del(>tr_list);
-   kfree(tr);
+   gfs2_trans_free(sdp, tr);
}
spin_unlock(>sd_ail_lock);
 }
@@ -1010,7 +1011,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct 
gfs2_glock *gl, u32 flags)
trace_gfs2_log_flush(sdp, 0, flags);
up_write(>sd_log_flush_lock);
 
-   kfree(tr);
+   gfs2_trans_free(sdp, tr);
 }
 
 /**
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index a1a295b739fb..733470ca6be9 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -143,6 +143,12 @@ static int __init init_gfs2_fs(void)
if (!gfs2_qadata_cachep)
goto fail_cachep7;
 
+   gfs2_trans_cachep = kmem_cache_create("gfs2_trans",
+  sizeof(struct gfs2_trans),
+  0, 0, NULL);
+   if (!gfs2_trans_cachep)
+   goto fail_cachep8;
+
error = register_shrinker(_qd_shrinker);
if (error)
goto fail_shrinker;
@@ -194,6 +200,8 @@ static int __init init_gfs2_fs(void)
 fail_fs1:
unregister_shrinker(_qd_shrinker);
 fail_shrinker:
+   kmem_cache_destroy(gfs2_trans_cachep);
+fail_cachep8:
kmem_cache_destroy(gfs2_qadata_cachep);
 fail_cachep7:
kmem_cache_destroy(gfs2_quotad_cachep);
@@ -236,6 +244,7 @@ static void __exit exit_gfs2_fs(void)
rcu_barrier();
 
mempool_destroy(gfs2_page_pool);
+   kmem_cache_destroy(gfs2_trans_cachep);
kmem_cache_destroy(gfs2_qadata_cachep);
kmem_cache_destroy(gfs2_quotad_cachep);
kmem_cache_destroy(gfs2_rgrpd_cachep);
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index ffe840505082..aa7cd0abb369 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -37,7 +37,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int 
blocks,
if (!test_bit(SDF_JOURNAL_LIVE, >sd_flags))
return -EROFS;
 
-   tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS);
+   tr = kmem_cache_zalloc(gfs2_trans_cachep, GFP_NOFS);
if (!tr)
return -ENOMEM;
 
@@ -53,6 +53,9 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int 
blocks,
INIT_LIST_HEAD(>tr_databuf);
INIT_LIST_HEAD(>tr_buf);
 
+   INIT_LIST_HEAD(>tr_ail1_list);
+   INIT_LIST_HEAD(>tr_ail2_list);
+
sb_start_intwrite(sdp->sd_vfs);
 
error = 

Re: [Cluster-devel] [PATCH 6/8] gfs2: instrumentation wrt log_flush stuck

2020-06-05 Thread Bob Peterson
- Original Message -
> On Fri, Jun 5, 2020 at 4:49 PM Bob Peterson  wrote:
> > Hi Andreas,
> >
> > - Original Message -
> > (snip)
> > > > @@ -970,7 +969,16 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct
> > > > gfs2_glock *gl, u32 flags)
> > > >
> > > > if (!(flags & GFS2_LOG_HEAD_FLUSH_NORMAL)) {
> > > > if (!sdp->sd_log_idle) {
> > > > +   unsigned long start = jiffies;
> > > > +
> > > > for (;;) {
> > > > +   if (time_after(jiffies, start + (HZ *
> > > > 600))) {
> > >
> > > This should probably have some rate limiting as well, for example:
> >
> > Seems unnecessary. If the log flush gets stuck, the message will be printed
> > once, and at most every 10 minutes.
> 
> No, after ten minutes, the message will actually be printed for each
> iteration of the loop. That's exactly why I was suggesting the rate
> limiting.

No, after ten minutes it dumps the ail list so you can see the problem
and exits the loop with "break;".

The next time it enters the loop, it starts with a new value of start
which doesn't expire for another ten minutes.

Bob



Re: [Cluster-devel] [PATCH 6/8] gfs2: instrumentation wrt log_flush stuck

2020-06-05 Thread Andreas Gruenbacher
On Fri, Jun 5, 2020 at 6:15 PM Bob Peterson  wrote:
> - Original Message -
> > On Fri, Jun 5, 2020 at 4:49 PM Bob Peterson  wrote:
> > > Hi Andreas,
> > >
> > > - Original Message -
> > > (snip)
> > > > > @@ -970,7 +969,16 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct
> > > > > gfs2_glock *gl, u32 flags)
> > > > >
> > > > > if (!(flags & GFS2_LOG_HEAD_FLUSH_NORMAL)) {
> > > > > if (!sdp->sd_log_idle) {
> > > > > +   unsigned long start = jiffies;
> > > > > +
> > > > > for (;;) {
> > > > > +   if (time_after(jiffies, start + (HZ *
> > > > > 600))) {
> > > >
> > > > This should probably have some rate limiting as well, for example:
> > >
> > > Seems unnecessary. If the log flush gets stuck, the message will be 
> > > printed
> > > once, and at most every 10 minutes.
> >
> > No, after ten minutes, the message will actually be printed for each
> > iteration of the loop. That's exactly why I was suggesting the rate
> > limiting.
>
> No, after ten minutes it dumps the ail list so you can see the problem
> and exits the loop with "break;".
>
> The next time it enters the loop, it starts with a new value of start
> which doesn't expire for another ten minutes.

Ok, I misread.

Thanks,
Andreas



Re: [Cluster-devel] [PATCH 0/8] Misc Patch Collection

2020-06-05 Thread Andreas Gruenbacher
Hi Bob,

On Tue, May 26, 2020 at 3:07 PM Bob Peterson  wrote:
> Andreas expressed some concerns about some of the others. For example, he
> didn't like that the new "status" sysfs file was taking "try" locks, but
> if the lock is held, I don't know of a better way to do this.

walking a linked list that's protected by a spin lock without holding
that spin lock is so fundamentally broken that I don't see why we even
need to discuss it. There's nothing that guarantees that we'll be able
to walk the list.

The spin_is_locked checks this patch adds for reporting the states of
spin locks are practically useless as well.

> He also expressed
> a concern that the new file should be in debugfs rather than sysfs.
> I'm open to opinions. Regardless of where it is, the new debug file is a
> perfect candidate to include in sos reports.

It may be better to include more relevant information in trace points.
That way, we automatically get that information correlated with other
trace events to see what was happening when.

Thanks,
Andreas



Re: [Cluster-devel] [PATCH 6/8] gfs2: instrumentation wrt log_flush stuck

2020-06-05 Thread Andreas Gruenbacher
Hi Bob,

On Tue, May 26, 2020 at 3:05 PM Bob Peterson  wrote:
> This adds checks for gfs2_log_flush being stuck, similarly to the check
> in gfs2_ail1_flush.
>
> Signed-off-by: Bob Peterson 
> ---
>  fs/gfs2/log.c | 14 +++---
>  1 file changed, 11 insertions(+), 3 deletions(-)
>
> diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
> index 1d51b4781bdd..636c82dda68b 100644
> --- a/fs/gfs2/log.c
> +++ b/fs/gfs2/log.c
> @@ -145,9 +145,6 @@ static void dump_ail_list(struct gfs2_sbd *sdp)
> struct gfs2_bufdata *bd;
> struct buffer_head *bh;
>
> -   fs_err(sdp, "Error: In gfs2_ail1_flush for ten minutes! t=%d\n",
> -  current->journal_info ? 1 : 0);
> -
> list_for_each_entry_reverse(tr, >sd_ail1_list, tr_list) {
> list_for_each_entry_reverse(bd, >tr_ail1_list,
> bd_ail_st_list) {
> @@ -197,6 +194,8 @@ void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct 
> writeback_control *wbc)
>  restart:
> ret = 0;
> if (time_after(jiffies, flush_start + (HZ * 600))) {
> +   fs_err(sdp, "Error: In gfs2_ail1_flush for ten minutes! "
> +  "t=%d\n", current->journal_info ? 1 : 0);
> dump_ail_list(sdp);
> goto out;
> }
> @@ -970,7 +969,16 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct 
> gfs2_glock *gl, u32 flags)
>
> if (!(flags & GFS2_LOG_HEAD_FLUSH_NORMAL)) {
> if (!sdp->sd_log_idle) {
> +   unsigned long start = jiffies;
> +
> for (;;) {
> +   if (time_after(jiffies, start + (HZ * 600))) {

This should probably have some rate limiting as well, for example:

start = jiffies;

I'm not sure what provides similar rate limiting in gfs2_ail1_flush.

> +   fs_err(sdp, "Error: In gfs2_log_flush 
> "
> +  "for 10 minutes! t=%d\n",
> +  current->journal_info ? 1 : 0);

Please don't break the format string up like that; this makes grepping harder.

> +   dump_ail_list(sdp);
> +   break;
> +   }
> gfs2_ail1_start(sdp);
> gfs2_ail1_wait(sdp);
> if (gfs2_ail1_empty(sdp, 0))
> --
> 2.26.2
>



[Cluster-devel] [GFS2 PATCH 2/6] gfs2: print mapping->nrpages in glock dump for address space glocks

2020-06-05 Thread Bob Peterson
This patch makes the glock dumps in debugfs print the number of pages
(nrpages) for address space glocks. This will aid in debugging.

Signed-off-by: Bob Peterson 
---
 fs/gfs2/glock.c | 25 -
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index bf70e3b14938..9a5dadc93cfc 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1978,7 +1978,13 @@ void gfs2_dump_glock(struct seq_file *seq, struct 
gfs2_glock *gl, bool fsid)
char gflags_buf[32];
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
char fs_id_buf[sizeof(sdp->sd_fsname) + 7];
+   unsigned long nrpages = 0;
 
+   if (gl->gl_ops->go_flags & GLOF_ASPACE) {
+   struct address_space *mapping = gfs2_glock2aspace(gl);
+
+   nrpages = mapping->nrpages;
+   }
memset(fs_id_buf, 0, sizeof(fs_id_buf));
if (fsid && sdp) /* safety precaution */
sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname);
@@ -1987,15 +1993,16 @@ void gfs2_dump_glock(struct seq_file *seq, struct 
gfs2_glock *gl, bool fsid)
if (!test_bit(GLF_DEMOTE, >gl_flags))
dtime = 0;
gfs2_print_dbg(seq, "%sG:  s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d "
-  "v:%d r:%d m:%ld\n", fs_id_buf, state2str(gl->gl_state),
- gl->gl_name.ln_type,
- (unsigned long long)gl->gl_name.ln_number,
- gflags2str(gflags_buf, gl),
- state2str(gl->gl_target),
- state2str(gl->gl_demote_state), dtime,
- atomic_read(>gl_ail_count),
- atomic_read(>gl_revokes),
- (int)gl->gl_lockref.count, gl->gl_hold_time);
+  "v:%d r:%d m:%ld p:%lu\n",
+  fs_id_buf, state2str(gl->gl_state),
+  gl->gl_name.ln_type,
+  (unsigned long long)gl->gl_name.ln_number,
+  gflags2str(gflags_buf, gl),
+  state2str(gl->gl_target),
+  state2str(gl->gl_demote_state), dtime,
+  atomic_read(>gl_ail_count),
+  atomic_read(>gl_revokes),
+  (int)gl->gl_lockref.count, gl->gl_hold_time, nrpages);
 
list_for_each_entry(gh, >gl_holders, gh_list)
dump_holder(seq, gh, fs_id_buf);
-- 
2.26.2



[Cluster-devel] [GFS2 PATCH 3/6] gfs2: instrumentation wrt log_flush stuck

2020-06-05 Thread Bob Peterson
This adds checks for gfs2_log_flush being stuck, similarly to the check
in gfs2_ail1_flush. To faciliate this and make the strings easy to grep
we move the ail1 emptying to its own function, empty_ail1_list.

Signed-off-by: Bob Peterson 
---
 fs/gfs2/log.c | 34 +-
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 0644e58c6191..fcc7f58d74f0 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -145,9 +145,6 @@ static void dump_ail_list(struct gfs2_sbd *sdp)
struct gfs2_bufdata *bd;
struct buffer_head *bh;
 
-   fs_err(sdp, "Error: In gfs2_ail1_flush for ten minutes! t=%d\n",
-  current->journal_info ? 1 : 0);
-
list_for_each_entry_reverse(tr, >sd_ail1_list, tr_list) {
list_for_each_entry_reverse(bd, >tr_ail1_list,
bd_ail_st_list) {
@@ -197,6 +194,8 @@ void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct 
writeback_control *wbc)
 restart:
ret = 0;
if (time_after(jiffies, flush_start + (HZ * 600))) {
+   fs_err(sdp, "Error: In %s for ten minutes! t=%d\n",
+  __func__, current->journal_info ? 1 : 0);
dump_ail_list(sdp);
goto out;
}
@@ -876,6 +875,28 @@ static void ail_drain(struct gfs2_sbd *sdp)
spin_unlock(>sd_ail_lock);
 }
 
+/**
+ * empty_ail1_list - try to start IO and empty the ail1 list
+ * @sdp: Pointer to GFS2 superblock
+ */
+static void empty_ail1_list(struct gfs2_sbd *sdp)
+{
+   unsigned long start = jiffies;
+
+   for (;;) {
+   if (time_after(jiffies, start + (HZ * 600))) {
+   fs_err(sdp, "Error: In %s for 10 minutes! t=%d\n",
+  __func__, current->journal_info ? 1 : 0);
+   dump_ail_list(sdp);
+   return;
+   }
+   gfs2_ail1_start(sdp);
+   gfs2_ail1_wait(sdp);
+   if (gfs2_ail1_empty(sdp, 0))
+   return;
+   }
+}
+
 /**
  * gfs2_log_flush - flush incore transaction(s)
  * @sdp: the filesystem
@@ -965,12 +986,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct 
gfs2_glock *gl, u32 flags)
 
if (!(flags & GFS2_LOG_HEAD_FLUSH_NORMAL)) {
if (!sdp->sd_log_idle) {
-   for (;;) {
-   gfs2_ail1_start(sdp);
-   gfs2_ail1_wait(sdp);
-   if (gfs2_ail1_empty(sdp, 0))
-   break;
-   }
+   empty_ail1_list(sdp);
if (gfs2_withdrawn(sdp))
goto out;
atomic_dec(>sd_log_blks_free); /* Adjust for 
unreserved buffer */
-- 
2.26.2



[Cluster-devel] [GFS2 PATCH v2] gfs2: new slab for transactions

2020-06-05 Thread Bob Peterson
Hi,

I've posted similar patches for this in the past, but they all had
shortcomings. This is my latest version. There is a follow-on patch
which I will post shortly. In the future, I want to add a new
slab initialization function, but for now this is enough.

Note that the use-after-free warnings in gfs2_trans_free() require
that we now properly initialize the transaction ail lists.

Bob Peterson

...

This patch adds a new slab for gfs2 transactions. That allows us to
have an initialization function and protect against some errors.

Signed-off-by: Bob Peterson 
---
 fs/gfs2/glops.c |  2 ++
 fs/gfs2/log.c   |  9 +
 fs/gfs2/main.c  |  9 +
 fs/gfs2/trans.c | 22 ++
 fs/gfs2/trans.h |  1 +
 fs/gfs2/util.c  |  1 +
 fs/gfs2/util.h  |  1 +
 7 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 9e9c7a4b8c66..0e4e6570adef 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -91,6 +91,8 @@ static int gfs2_ail_empty_gl(struct gfs2_glock *gl)
memset(, 0, sizeof(tr));
INIT_LIST_HEAD(_buf);
INIT_LIST_HEAD(_databuf);
+   INIT_LIST_HEAD(_ail1_list);
+   INIT_LIST_HEAD(_ail2_list);
tr.tr_revokes = atomic_read(>gl_ail_count);
 
if (!tr.tr_revokes) {
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index d6e5734651a4..79aa0fb4984e 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -30,6 +30,7 @@
 #include "util.h"
 #include "dir.h"
 #include "trace_gfs2.h"
+#include "trans.h"
 
 static void gfs2_log_shutdown(struct gfs2_sbd *sdp);
 
@@ -379,7 +380,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int 
new_tail)
list_del(>tr_list);
gfs2_assert_warn(sdp, list_empty(>tr_ail1_list));
gfs2_assert_warn(sdp, list_empty(>tr_ail2_list));
-   kfree(tr);
+   gfs2_trans_free(sdp, tr);
}
 
spin_unlock(>sd_ail_lock);
@@ -868,14 +869,14 @@ static void ail_drain(struct gfs2_sbd *sdp)
gfs2_ail_empty_tr(sdp, tr, >tr_ail1_list);
gfs2_ail_empty_tr(sdp, tr, >tr_ail2_list);
list_del(>tr_list);
-   kfree(tr);
+   gfs2_trans_free(sdp, tr);
}
while (!list_empty(>sd_ail2_list)) {
tr = list_first_entry(>sd_ail2_list, struct gfs2_trans,
  tr_list);
gfs2_ail_empty_tr(sdp, tr, >tr_ail2_list);
list_del(>tr_list);
-   kfree(tr);
+   gfs2_trans_free(sdp, tr);
}
spin_unlock(>sd_ail_lock);
 }
@@ -1007,7 +1008,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct 
gfs2_glock *gl, u32 flags)
trace_gfs2_log_flush(sdp, 0, flags);
up_write(>sd_log_flush_lock);
 
-   kfree(tr);
+   gfs2_trans_free(sdp, tr);
 }
 
 /**
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index a1a295b739fb..733470ca6be9 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -143,6 +143,12 @@ static int __init init_gfs2_fs(void)
if (!gfs2_qadata_cachep)
goto fail_cachep7;
 
+   gfs2_trans_cachep = kmem_cache_create("gfs2_trans",
+  sizeof(struct gfs2_trans),
+  0, 0, NULL);
+   if (!gfs2_trans_cachep)
+   goto fail_cachep8;
+
error = register_shrinker(_qd_shrinker);
if (error)
goto fail_shrinker;
@@ -194,6 +200,8 @@ static int __init init_gfs2_fs(void)
 fail_fs1:
unregister_shrinker(_qd_shrinker);
 fail_shrinker:
+   kmem_cache_destroy(gfs2_trans_cachep);
+fail_cachep8:
kmem_cache_destroy(gfs2_qadata_cachep);
 fail_cachep7:
kmem_cache_destroy(gfs2_quotad_cachep);
@@ -236,6 +244,7 @@ static void __exit exit_gfs2_fs(void)
rcu_barrier();
 
mempool_destroy(gfs2_page_pool);
+   kmem_cache_destroy(gfs2_trans_cachep);
kmem_cache_destroy(gfs2_qadata_cachep);
kmem_cache_destroy(gfs2_quotad_cachep);
kmem_cache_destroy(gfs2_rgrpd_cachep);
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 9aed58f7368d..348ecd3fc167 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -37,7 +37,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int 
blocks,
if (!test_bit(SDF_JOURNAL_LIVE, >sd_flags))
return -EROFS;
 
-   tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS);
+   tr = kmem_cache_zalloc(gfs2_trans_cachep, GFP_NOFS);
if (!tr)
return -ENOMEM;
 
@@ -53,6 +53,9 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int 
blocks,
INIT_LIST_HEAD(>tr_databuf);
INIT_LIST_HEAD(>tr_buf);
 
+   INIT_LIST_HEAD(>tr_ail1_list);
+   INIT_LIST_HEAD(>tr_ail2_list);
+
sb_start_intwrite(sdp->sd_vfs);
 
error = gfs2_log_reserve(sdp, tr->tr_reserved);
@@ -65,7 +68,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int 
blocks,
 
 fail:
 

[Cluster-devel] [PATCH AUTOSEL 5.4 10/14] gfs2: Even more gfs2_find_jhead fixes

2020-06-05 Thread Sasha Levin
From: Andreas Gruenbacher 

[ Upstream commit 20be493b787cd581c9fffad7fcd6bfbe6af1050c ]

Fix several issues in the previous gfs2_find_jhead fix:
* When updating @blocks_submitted, @block refers to the first block block not
  submitted yet, not the last block submitted, so fix an off-by-one error.
* We want to ensure that @blocks_submitted is far enough ahead of @blocks_read
  to guarantee that there is in-flight I/O.  Otherwise, we'll eventually end up
  waiting for pages that haven't been submitted, yet.
* It's much easier to compare the number of blocks added with the number of
  blocks submitted to limit the maximum bio size.
* Even with bio chaining, we can keep adding blocks until we reach the maximum
  bio size, as long as we stop at a page boundary.  This simplifies the logic.

Signed-off-by: Andreas Gruenbacher 
Reviewed-by: Bob Peterson 
Signed-off-by: Sasha Levin 
---
 fs/gfs2/lops.c | 15 +--
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 8303b44a5068..d2ed4dc4434c 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -504,12 +504,12 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct 
gfs2_log_header_host *head,
unsigned int bsize = sdp->sd_sb.sb_bsize, off;
unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift;
unsigned int shift = PAGE_SHIFT - bsize_shift;
-   unsigned int max_bio_size = 2 * 1024 * 1024;
+   unsigned int max_blocks = 2 * 1024 * 1024 >> bsize_shift;
struct gfs2_journal_extent *je;
int sz, ret = 0;
struct bio *bio = NULL;
struct page *page = NULL;
-   bool bio_chained = false, done = false;
+   bool done = false;
errseq_t since;
 
memset(head, 0, sizeof(*head));
@@ -532,10 +532,7 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct 
gfs2_log_header_host *head,
off = 0;
}
 
-   if (!bio || (bio_chained && !off) ||
-   bio->bi_iter.bi_size >= max_bio_size) {
-   /* start new bio */
-   } else {
+   if (bio && (off || block < blocks_submitted + 
max_blocks)) {
sector_t sector = dblock << 
sdp->sd_fsb2bb_shift;
 
if (bio_end_sector(bio) == sector) {
@@ -548,19 +545,17 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct 
gfs2_log_header_host *head,
(PAGE_SIZE - off) >> 
bsize_shift;
 
bio = gfs2_chain_bio(bio, blocks);
-   bio_chained = true;
goto add_block_to_new_bio;
}
}
 
if (bio) {
-   blocks_submitted = block + 1;
+   blocks_submitted = block;
submit_bio(bio);
}
 
bio = gfs2_log_alloc_bio(sdp, dblock, 
gfs2_end_log_read);
bio->bi_opf = REQ_OP_READ;
-   bio_chained = false;
 add_block_to_new_bio:
sz = bio_add_page(bio, page, bsize, off);
BUG_ON(sz != bsize);
@@ -568,7 +563,7 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct 
gfs2_log_header_host *head,
off += bsize;
if (off == PAGE_SIZE)
page = NULL;
-   if (blocks_submitted < 2 * max_bio_size >> bsize_shift) 
{
+   if (blocks_submitted <= blocks_read + max_blocks) {
/* Keep at least one bio in flight */
continue;
}
-- 
2.25.1




[Cluster-devel] [PATCH AUTOSEL 5.6 12/17] gfs2: Even more gfs2_find_jhead fixes

2020-06-05 Thread Sasha Levin
From: Andreas Gruenbacher 

[ Upstream commit 20be493b787cd581c9fffad7fcd6bfbe6af1050c ]

Fix several issues in the previous gfs2_find_jhead fix:
* When updating @blocks_submitted, @block refers to the first block block not
  submitted yet, not the last block submitted, so fix an off-by-one error.
* We want to ensure that @blocks_submitted is far enough ahead of @blocks_read
  to guarantee that there is in-flight I/O.  Otherwise, we'll eventually end up
  waiting for pages that haven't been submitted, yet.
* It's much easier to compare the number of blocks added with the number of
  blocks submitted to limit the maximum bio size.
* Even with bio chaining, we can keep adding blocks until we reach the maximum
  bio size, as long as we stop at a page boundary.  This simplifies the logic.

Signed-off-by: Andreas Gruenbacher 
Reviewed-by: Bob Peterson 
Signed-off-by: Sasha Levin 
---
 fs/gfs2/lops.c | 15 +--
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 3a020bdc358c..966ed37c9acd 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -505,12 +505,12 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct 
gfs2_log_header_host *head,
unsigned int bsize = sdp->sd_sb.sb_bsize, off;
unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift;
unsigned int shift = PAGE_SHIFT - bsize_shift;
-   unsigned int max_bio_size = 2 * 1024 * 1024;
+   unsigned int max_blocks = 2 * 1024 * 1024 >> bsize_shift;
struct gfs2_journal_extent *je;
int sz, ret = 0;
struct bio *bio = NULL;
struct page *page = NULL;
-   bool bio_chained = false, done = false;
+   bool done = false;
errseq_t since;
 
memset(head, 0, sizeof(*head));
@@ -533,10 +533,7 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct 
gfs2_log_header_host *head,
off = 0;
}
 
-   if (!bio || (bio_chained && !off) ||
-   bio->bi_iter.bi_size >= max_bio_size) {
-   /* start new bio */
-   } else {
+   if (bio && (off || block < blocks_submitted + 
max_blocks)) {
sector_t sector = dblock << 
sdp->sd_fsb2bb_shift;
 
if (bio_end_sector(bio) == sector) {
@@ -549,19 +546,17 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct 
gfs2_log_header_host *head,
(PAGE_SIZE - off) >> 
bsize_shift;
 
bio = gfs2_chain_bio(bio, blocks);
-   bio_chained = true;
goto add_block_to_new_bio;
}
}
 
if (bio) {
-   blocks_submitted = block + 1;
+   blocks_submitted = block;
submit_bio(bio);
}
 
bio = gfs2_log_alloc_bio(sdp, dblock, 
gfs2_end_log_read);
bio->bi_opf = REQ_OP_READ;
-   bio_chained = false;
 add_block_to_new_bio:
sz = bio_add_page(bio, page, bsize, off);
BUG_ON(sz != bsize);
@@ -569,7 +564,7 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct 
gfs2_log_header_host *head,
off += bsize;
if (off == PAGE_SIZE)
page = NULL;
-   if (blocks_submitted < 2 * max_bio_size >> bsize_shift) 
{
+   if (blocks_submitted <= blocks_read + max_blocks) {
/* Keep at least one bio in flight */
continue;
}
-- 
2.25.1




[Cluster-devel] [gfs2 patch] gfs2: fix use-after-free on transaction ail lists

2020-06-05 Thread Bob Peterson
Hi,

Before this patch, transactions could be merged into the system
transaction by function gfs2_merge_trans(), but then it is not
considered attached. The caller, log_refund, only sets TR_ATTACHED
if the transaction is not merged. Later, in function, gfs2_trans_end,
if TR_ATTACHED is not set (because it was merged) the transaction is
freed by gfs2_trans_free(). However, by the time this happens, the
log flush mechanism may have queued bd elements to the ail list,
which means tr_ail1_list may not be empty, and the flush may forget
about the bd elements altogether (memory leak) or worse, reference
them after the transaction has been freed.

This patch adds logic into gfs2_merge_trans() to move the merged
transaction's ail lists to the sdp transaction. This prevents the
use-after-free.

To facilitate this, we pass sdp into the function instead of the
transaction itself.

Signed-off-by: Bob Peterson 
---
 fs/gfs2/log.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 79aa0fb4984e..91e1eebc5ee0 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -1017,8 +1017,10 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct 
gfs2_glock *gl, u32 flags)
  * @new: New transaction to be merged
  */
 
-static void gfs2_merge_trans(struct gfs2_trans *old, struct gfs2_trans *new)
+static void gfs2_merge_trans(struct gfs2_sbd *sdp, struct gfs2_trans *new)
 {
+   struct gfs2_trans *old = sdp->sd_log_tr;
+
WARN_ON_ONCE(!test_bit(TR_ATTACHED, >tr_flags));
 
old->tr_num_buf_new += new->tr_num_buf_new;
@@ -1030,6 +1032,11 @@ static void gfs2_merge_trans(struct gfs2_trans *old, 
struct gfs2_trans *new)
 
list_splice_tail_init(>tr_databuf, >tr_databuf);
list_splice_tail_init(>tr_buf, >tr_buf);
+
+   spin_lock(>sd_ail_lock);
+   list_splice_tail_init(>tr_ail1_list, >tr_ail1_list);
+   list_splice_tail_init(>tr_ail2_list, >tr_ail2_list);
+   spin_unlock(>sd_ail_lock);
 }
 
 static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
@@ -1041,7 +1048,7 @@ static void log_refund(struct gfs2_sbd *sdp, struct 
gfs2_trans *tr)
gfs2_log_lock(sdp);
 
if (sdp->sd_log_tr) {
-   gfs2_merge_trans(sdp->sd_log_tr, tr);
+   gfs2_merge_trans(sdp, tr);
} else if (tr->tr_num_buf_new || tr->tr_num_databuf_new) {
gfs2_assert_withdraw(sdp, test_bit(TR_ALLOCED, >tr_flags));
sdp->sd_log_tr = tr;