Re: [Yaffs] [PATCH] yaffs: fix spinning when flush inodes

Attachments:
Message as email (text/plain) delayed-iput_2.patch (text/x-patch)

Author: Mihai Ene
Date:
To: yaffs
Subject: Re: [Yaffs] [PATCH] yaffs: fix spinning when flush inodes

Thanks for this patch.
I think we have the same problem. See:
http://lists.aleph1.co.uk/lurker/message/20141013.115546.aaeb2d16.en.html.
I also believe that the problem is because of the infinite loop from
yaffs_flush_inodes
since an element points to itself. The question is how do we end in this
situation (and it
happens only in multiprocessor situation; maxcpu=1 will not have any
problem)?

Now, while your patch doesn't answer my question, I still have tested it
on my system
and it works fine so far.

In my investigation I notice that a printk as first line in
yaffs_flush_super will not reproduce
I eliminated the printk from your patch in order to not affect timings.
This is also running fine.

I have a patch that I want to share with you (see attach). Comments are
in the patch.
I would appreciate any feedback.

--Mihai

On 10/28/2014 03:54 PM, Stefan Agner wrote:
> While in list_for_each_entry() of yaffs_flush_inodes, the fs code
> can delete inodes. This leads to an endless loop which causes a
> softlockup. Typically this happend in sync_supers when creating
> and deleting files while under CPU load.
>
> This fix checks whether we get twice the same inode. If this is
> true, we just retry again.
>
> This is an alternative fix to the proposed fix Jisheng Zhang:
> yaffs: fix softlockup cauesed by inode deleted when scanning s_inodes list
> http://www.aleph1.co.uk/lurker/message/20110831.075307.3cfeacdf.fr.html
> ---
> Hi,
>
> I sent this email already some weeks ago, however it did not make it
> through to the list. Hence this resend.
>
> I can see Jisheng's issue (see link above) on a Tegra 2 device (Colibri
> T20) using 3.1 L4T (Linux for Tegra) Kernel. The devices which show the
> error had multiple YAFFS2 partitions. I could successfully reproduce the
> issue by using bonnie++ on two independent partitions formatted YAFFS2.
> After some hours (3-5) the sync_supers kernel thread starts to loop in
> the list_for_each_entry of the yaffs_flush_inodes function.
>
> I also set the dirty_writeback_centiseconds to 1, but I'm not sure if
> this really forces the error to happen more often.
> echo 1 > /proc/sys/vm/dirty_writeback_centisecs
>
> However, JiSheng's patch leads to different errors on my setup, e.g.
> kernel BUG at ../fs/inode.c:1359!
> kernel BUG at ../fs/inode.c:431!
>
> This is a rather ugly fix which just works around the actual symptoms,
> but it solved the problem for me so far.
>
> --
> Stefan
>
> fs/yaffs2/yaffs_vfs.c | 16 +++++++++++++++-
> 1 file changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/fs/yaffs2/yaffs_vfs.c b/fs/yaffs2/yaffs_vfs.c
> index 6d4136d..16502e6 100644
> --- a/fs/yaffs2/yaffs_vfs.c
> +++ b/fs/yaffs2/yaffs_vfs.c
> @@ -1520,9 +1520,11 @@ static int yaffs_statfs(struct dentry *dentry, struct kstatfs *buf)
>
> static void yaffs_flush_inodes(struct super_block *sb) > { > - struct inode *iptr; > + struct inode *iptr, *iptr_tmp; > struct yaffs_obj *obj;

>
> +retry: > + iptr_tmp = NULL; > list_for_each_entry(iptr, &sb->s_inodes, i_sb_list) { > obj = yaffs_inode_to_obj(iptr); > if (obj) { > @@ -1530,6 +1532,18 @@ static void yaffs_flush_inodes(struct super_block *sb) > "flushing obj %d", obj->obj_id); > yaffs_flush_file(obj, 1, 0); > } > + > + /* > + * HACK: if we get the same iptr twice, someone removed (?) > + * this inode while we are iterating. Start over again > + */ > + if (iptr_tmp == iptr) { > + printk(KERN_ERR "yaffs: Got twice the same inode %p\n", > + iptr); > + goto retry; > + } > + > + iptr_tmp = iptr; > } > }

>

diff -Naur a/fs/yaffs2/yaffs_vfs.c b/fs/yaffs2/yaffs_vfs.c

--- a/fs/yaffs2/yaffs_vfs.c    2014-10-24 09:16:44.754318000 +0200
+++ b/fs/yaffs2/yaffs_vfs.c    2014-10-27 16:36:37.290006000 +0100
@@ -57,6 +57,8 @@
 #include <linux/uaccess.h>
 #include <linux/mtd/mtd.h>

+#include "../internal.h"
+
#include "yportenv.h"
#include "yaffs_trace.h"
#include "yaffs_guts.h"
@@ -81,6 +83,12 @@
module_param(yaffs_bg_enable, uint, 0644);
module_param(yaffs_auto_select, uint, 0644);

+/* Simple inode list. */
+typedef struct {
+    struct inode *inode;
+    struct list_head list;
+} yaffs_inode_list_t;
+
 #define yaffs_devname(sb, buf)    bdevname(sb->s_bdev, buf)

 static uint32_t YCALCBLOCKS(uint64_t partition_size, uint32_t block_size)
@@ -1488,29 +1496,204 @@
     return 0;
 }

-static void yaffs_flush_inodes(struct super_block *sb)
+
+static int yaffs_delay_iput(struct list_head *delayed_iput_list, struct inode *inode) {
+    // GFP_NOWAIT - as we have spinlocks - we can't wait
+    // and we need to allocate memory before iget()
+    // to be able to add iget()'ed reference to list
+    yaffs_inode_list_t *allocated = kmalloc(sizeof(yaffs_inode_list_t), GFP_NOWAIT);
+    if (unlikely(allocated == NULL)) {
+        printk(KERN_ERR "yaffs has no memory to allocate item for delayed iput\n");
+        // no memory - return error
+        return -ENOMEM;
+    }
+    allocated->inode = inode;
+    INIT_LIST_HEAD(&(allocated->list));
+    list_add(&allocated->list, delayed_iput_list);
+    return 0;
+}
+
+
+static void yaffs_iput_inodes(struct list_head *delayed_iput_list) {
+    yaffs_inode_list_t *item, *next;
+    list_for_each_entry_safe(item, next, delayed_iput_list, list) {
+        iput(item->inode);
+        list_del_init(&item->list);
+        kfree(item);
+    }
+}
+
+
+static void yaffs_flush_inodes(struct super_block *sb, struct list_head* delayed_iput_list)
 {
-    struct inode *iptr;
+    struct inode *inode;
     struct yaffs_obj *obj;

-    list_for_each_entry(iptr, &sb->s_inodes, i_sb_list) {
-        obj = yaffs_inode_to_obj(iptr);
-        if (obj) {
-            yaffs_trace(YAFFS_TRACE_OS,
-                "flushing obj %d", obj->obj_id);
-            yaffs_flush_file(obj, 1, 0);
+    /* 
+     * This lock is needed to have exclusive access to inodes list in sb (superblock)
+     * Precisely this lock protects pointers linking elements of list.
+     * This lock is used only during switch from current inode to next one.
+     * So during processing of current inode, list manipulation is allowed.
+     * In effect after flush performed by this loop it is not guaranted
+     * that all inodes at that time are flushed. New nodes could have been added
+     * or old removed.
+     */
+    spin_lock(&inode_sb_list_lock);
+    list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+
+        obj = yaffs_inode_to_obj(inode);
+
+        /*
+         * Actual work on current node during iteration
+         * is performed by yaffs_flush_file().
+         * This function doesn't process nodes without yaffs_obj attached
+         * and also those with attached one but not in dirty state.
+         * So to move forward quickly we can do this checks now
+         * and continue interation when conditions are not met.
+         */
+        if (!obj || !obj->dirty) {
+            continue;
         }
+
+        /* 
+         * This spinlock protects i_state and i_count which we need.
+         * There are inode states that are not suitable for flushing.
+         * Precisely I_NEW state which means 'inode under construction'
+         * And I_FREEING which means 'inode beeing evicted'
+         * i_count is incremented by __iget() and we use that to
+         * mark node as 'used' during flushing to prevent evict
+         * beeing triggered when i_count reaches 0.
+         */
+        spin_lock(&inode->i_lock);
+
+        /*
+         * More on i_state states:
+         *
+         * We cannot __iget() an inode in state I_FREEING
+         * this state is set in inode.c in iput_final()
+         * just befor call to evict() which in turn
+         * removes inode from superblocks list of inodes 
+         * __iget() won't prevent this in any way.
+         * If indode is dirty at that time in must be handled
+         * in yaffs_evict_inode() callback - not here
+         *
+         * I_WILL_FREE is not possible state for yaffs
+         * In inode.c in iput_final() drop variable is always 1 (true)
+         * so this state won't ever be set.
+         * This is in turn because yaffs doesn't provide
+         * drop_inode() callback.
+         * Default one: generic_drop_inode() defaults to 1 (true)
+         * when there are no users for inode (i_count is 0).
+         * So it is skipped as yaffs doesn't make use of it.
+         * Eventually in the future when yaffs will do
+         * writing node schould be handled by write_inode_now()
+         *
+         * I_NEW is set by iget_locked() called from yaffs_iget()
+         * during creation of inode.
+         * Then gross lock is locked and yaffs creates new yaffs_obj
+         * for inode. Then grosslock is unlocked and I_NEW is cleared
+         * by unlock_new_inode() to finish inode creation
+         * This loop can reach I_NEW inodes and there
+         * would be two cases:
+         * 1) inode in I_NEW state but without yaffs_obj
+         *    this one would be skipped
+         * 2) i node in I_NEW state with new yaffs_obj
+         *    just before clearing I_NEW state
+         * But as I_NEW marks inode creation and can be
+         * extended in future it would be good to skip it also
+         * as it means inode under construction
+         *
+         * So this states are always skipped during iteration
+         */
+        if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
+            spin_unlock(&inode->i_lock);
+            continue;
+        }
+
+        /*
+         * "If i_count is zero, the inode cannot have any watches and
+         *  doing an __iget/iput with MS_ACTIVE clear would actually
+         *  evict all inodes with zero i_count from icache which is
+         *  unnecessarily violent and may in fact be illegal to do."
+         *                             found in fs/notify/inode_mark.c
+         *
+         * I guess that this refers to logic from iput_final()
+         * to cases when drop is 0 (false) for nodes with i_count == 0
+         * As mentioned earlier for yaffs this cases are not possible.
+         * For yaffs nodes with i_count == 0 are always evicted.
+         * Precisely when i_count == 0 i_state is changed to I_FREEING
+         * under protection of i_lock (see iput() ... iput_final())
+         * and we skip such nodes as beeing removed and let yaffs_evict_inode()
+         * do the job.
+         *
+         * So this is not important for yaffs
+         */
+        //if (!atomic_read(&inode->i_count)) {
+        //    spin_unlock(&inode->i_lock);
+        //    continue;
+        //}
+        
+        if (unlikely(yaffs_delay_iput(delayed_iput_list, inode))) {
+            // no memory we need to skip this node as we can't iget() it
+            // as we would be not be able to iput() it later
+            spin_unlock(&inode->i_lock);
+            continue; // or break - not sure yet which is better
+        }
+
+        /* 
+         * Increment use counter to preven node removal during flush operation
+         */
+        __iget(inode);
+
+        /* 
+         * Data protected by this lock is not accesed or changed anymore.
+         * inode won't be removed from list because we declared use of it by calling __iget()
+         * */
+        spin_unlock(&inode->i_lock);
+
+        /*
+         * yaffs_flush_file do I/O and can possibly loose CPU this is not allowed
+         * with spinlock beeing locked. We need to release it.
+         * It is done here because:
+         * 1) all quick continue conditions are over current inode is dirty and needs flushing
+         * 2) there is locking order for those two spinlock that we use
+         *    and inode_sb_list_lock need to be unlocked after i_lock
+         */
+        spin_unlock(&inode_sb_list_lock);
+
+        /*
+         * Store this inode on temporary list.
+         * Is is needed beacuse iput() must be called to revert __iget()
+         * but this may lead eventually to yaffs_evict_inode() as we can be last users of it.
+         * But yaffs_evict_inode needs gross lock which we hold.
+         * We need to call iput() later when gross lock is released
+         * This is why this temporary list is beeing used.
+         * Oryginal patch used in WR4.3 was releasing here gross lock to call iput().
+         * This was wrong as in case we were holding last reference
+         * this would immediately trigger iput_final() .. evict() .. yaffs_evict_inode()
+         * which will removed node from list and iteration will fail.
+         */
+
+        yaffs_trace(YAFFS_TRACE_OS,
+            "flushing obj %d", obj->obj_id);
+        yaffs_flush_file(obj, 1, 0);
+
+        /* we need this lock to select next node and continue iteration */
+        spin_lock(&inode_sb_list_lock);
     }
+    /* end of iteration release list lock used to detect end of list */
+    spin_unlock(&inode_sb_list_lock);
 }

-static void yaffs_flush_super(struct super_block *sb, int do_checkpoint)
+
+static void yaffs_flush_super(struct super_block *sb, int do_checkpoint, struct list_head * delayed_iput_list)
 {
     struct yaffs_dev *dev = yaffs_super_to_dev(sb);

     if (!dev)
         return;

-    yaffs_flush_inodes(sb);
+    yaffs_flush_inodes(sb, delayed_iput_list);
     yaffs_update_dirty_dirs(dev);
     yaffs_flush_whole_cache(dev);
     if (do_checkpoint)
@@ -1554,18 +1737,31 @@
         request_checkpoint ? "checkpoint requested" : "no checkpoint",
         oneshot_checkpoint ? " one-shot" : "");

+    /*
+     * Temporary list for delayed iput of nodes 'locked' by __iget()
+     * during flush operation. This list is created on stack to
+     * ensure 1:1 relation between __iget() and iput() calls
+     */
+    LIST_HEAD(delayed_iput_list);
+
     yaffs_gross_lock(dev);
     do_checkpoint = ((request_checkpoint && !gc_urgent) ||
              oneshot_checkpoint) && !dev->is_checkpointed;

     if (sb->s_dirt || do_checkpoint) {
-        yaffs_flush_super(sb, !dev->is_checkpointed && do_checkpoint);
+        yaffs_flush_super(sb, !dev->is_checkpointed && do_checkpoint, &delayed_iput_list);
         sb->s_dirt = 0;
         if (oneshot_checkpoint)
             yaffs_auto_checkpoint &= ~4;
     }
     yaffs_gross_unlock(dev);

+    /*
+     * Call iput() on nodes 'locked' by __iget() by flush operation.
+     * It needs to be done after yaffs_gross_unlock()
+     */
+    yaffs_iput_inodes(&delayed_iput_list);
+
     return 0;
 }

@@ -1927,9 +2123,16 @@
     yaffs_trace(YAFFS_TRACE_OS | YAFFS_TRACE_BACKGROUND,
         "yaffs background thread shut down");

+    /*
+     * Temporary list for delayed iput of nodes 'locked' by __iget()
+     * during flush operation. This list is created on stack to
+     * ensure 1:1 relation between __iget() and iput() calls
+     */
+    LIST_HEAD(delayed_iput_list);
+
     yaffs_gross_lock(dev);

-    yaffs_flush_super(sb, 1);
+    yaffs_flush_super(sb, 1, &delayed_iput_list);

     if (yaffs_dev_to_lc(dev)->put_super_fn)
         yaffs_dev_to_lc(dev)->put_super_fn(sb);
@@ -1937,6 +2140,13 @@
     yaffs_deinitialise(dev);

     yaffs_gross_unlock(dev);
+
+    /*
+     * Call iput() on nodes 'locked' by __iget() by flush operation.
+     * It needs to be done after yaffs_gross_unlock()
+     */
+    yaffs_iput_inodes(&delayed_iput_list);
+
     mutex_lock(&yaffs_context_lock);
     list_del_init(&(yaffs_dev_to_lc(dev)->context_list));
     mutex_unlock(&yaffs_context_lock);

This message is part of the following thread:
	the complete thread tree sorted by date
	Stefan Agner at