Thanks for this patch.
I think we have the same problem. See:
http://lists.aleph1.co.uk/lurker/message/20141013.115546.aaeb2d16.en.html.
I also believe that the problem is because of the infinite loop from
yaffs_flush_inodes
since an element points to itself. The question is how do we end in this
situation (and it
happens only in multiprocessor situation; maxcpu=1 will not have any
problem)?
Now, while your patch doesn't answer my question, I still have tested it
on my system
and it works fine so far.
In my investigation I notice that a printk as first line in
yaffs_flush_super will not reproduce
I eliminated the printk from your patch in order to not affect timings.
This is also running fine.
I have a patch that I want to share with you (see attach). Comments are
in the patch.
I would appreciate any feedback.
--Mihai
On 10/28/2014 03:54 PM, Stefan Agner wrote:
> While in list_for_each_entry() of yaffs_flush_inodes, the fs code
> can delete inodes. This leads to an endless loop which causes a
> softlockup. Typically this happend in sync_supers when creating
> and deleting files while under CPU load.
>
> This fix checks whether we get twice the same inode. If this is
> true, we just retry again.
>
> This is an alternative fix to the proposed fix Jisheng Zhang:
> yaffs: fix softlockup cauesed by inode deleted when scanning s_inodes list
> http://www.aleph1.co.uk/lurker/message/20110831.075307.3cfeacdf.fr.html
> ---
> Hi,
>
> I sent this email already some weeks ago, however it did not make it
> through to the list. Hence this resend.
>
> I can see Jisheng's issue (see link above) on a Tegra 2 device (Colibri
> T20) using 3.1 L4T (Linux for Tegra) Kernel. The devices which show the
> error had multiple YAFFS2 partitions. I could successfully reproduce the
> issue by using bonnie++ on two independent partitions formatted YAFFS2.
> After some hours (3-5) the sync_supers kernel thread starts to loop in
> the list_for_each_entry of the yaffs_flush_inodes function.
>
> I also set the dirty_writeback_centiseconds to 1, but I'm not sure if
> this really forces the error to happen more often.
> echo 1 > /proc/sys/vm/dirty_writeback_centisecs
>
> However, JiSheng's patch leads to different errors on my setup, e.g.
> kernel BUG at ../fs/inode.c:1359!
> kernel BUG at ../fs/inode.c:431!
>
> This is a rather ugly fix which just works around the actual symptoms,
> but it solved the problem for me so far.
>
> --
> Stefan
>
> fs/yaffs2/yaffs_vfs.c | 16 +++++++++++++++-
> 1 file changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/fs/yaffs2/yaffs_vfs.c b/fs/yaffs2/yaffs_vfs.c
> index 6d4136d..16502e6 100644
> --- a/fs/yaffs2/yaffs_vfs.c
> +++ b/fs/yaffs2/yaffs_vfs.c
> @@ -1520,9 +1520,11 @@ static int yaffs_statfs(struct dentry *dentry, struct kstatfs *buf)
>
> static void yaffs_flush_inodes(struct super_block *sb)
> {
> - struct inode *iptr;
> + struct inode *iptr, *iptr_tmp;
> struct yaffs_obj *obj;
>
> +retry:
> + iptr_tmp = NULL;
> list_for_each_entry(iptr, &sb->s_inodes, i_sb_list) {
> obj = yaffs_inode_to_obj(iptr);
> if (obj) {
> @@ -1530,6 +1532,18 @@ static void yaffs_flush_inodes(struct super_block *sb)
> "flushing obj %d", obj->obj_id);
> yaffs_flush_file(obj, 1, 0);
> }
> +
> + /*
> + * HACK: if we get the same iptr twice, someone removed (?)
> + * this inode while we are iterating. Start over again
> + */
> + if (iptr_tmp == iptr) {
> + printk(KERN_ERR "yaffs: Got twice the same inode %p\n",
> + iptr);
> + goto retry;
> + }
> +
> + iptr_tmp = iptr;
> }
> }
>
diff -Naur a/fs/yaffs2/yaffs_vfs.c b/fs/yaffs2/yaffs_vfs.c
--- a/fs/yaffs2/yaffs_vfs.c 2014-10-24 09:16:44.754318000 +0200
+++ b/fs/yaffs2/yaffs_vfs.c 2014-10-27 16:36:37.290006000 +0100
@@ -57,6 +57,8 @@
#include <linux/uaccess.h>
#include <linux/mtd/mtd.h>
+#include "../internal.h"
+
#include "yportenv.h"
#include "yaffs_trace.h"
#include "yaffs_guts.h"
@@ -81,6 +83,12 @@
module_param(yaffs_bg_enable, uint, 0644);
module_param(yaffs_auto_select, uint, 0644);
+/* Simple inode list. */
+typedef struct {
+ struct inode *inode;
+ struct list_head list;
+} yaffs_inode_list_t;
+
#define yaffs_devname(sb, buf) bdevname(sb->s_bdev, buf)
static uint32_t YCALCBLOCKS(uint64_t partition_size, uint32_t block_size)
@@ -1488,29 +1496,204 @@
return 0;
}
-static void yaffs_flush_inodes(struct super_block *sb)
+
+static int yaffs_delay_iput(struct list_head *delayed_iput_list, struct inode *inode) {
+ // GFP_NOWAIT - as we have spinlocks - we can't wait
+ // and we need to allocate memory before iget()
+ // to be able to add iget()'ed reference to list
+ yaffs_inode_list_t *allocated = kmalloc(sizeof(yaffs_inode_list_t), GFP_NOWAIT);
+ if (unlikely(allocated == NULL)) {
+ printk(KERN_ERR "yaffs has no memory to allocate item for delayed iput\n");
+ // no memory - return error
+ return -ENOMEM;
+ }
+ allocated->inode = inode;
+ INIT_LIST_HEAD(&(allocated->list));
+ list_add(&allocated->list, delayed_iput_list);
+ return 0;
+}
+
+
+static void yaffs_iput_inodes(struct list_head *delayed_iput_list) {
+ yaffs_inode_list_t *item, *next;
+ list_for_each_entry_safe(item, next, delayed_iput_list, list) {
+ iput(item->inode);
+ list_del_init(&item->list);
+ kfree(item);
+ }
+}
+
+
+static void yaffs_flush_inodes(struct super_block *sb, struct list_head* delayed_iput_list)
{
- struct inode *iptr;
+ struct inode *inode;
struct yaffs_obj *obj;
- list_for_each_entry(iptr, &sb->s_inodes, i_sb_list) {
- obj = yaffs_inode_to_obj(iptr);
- if (obj) {
- yaffs_trace(YAFFS_TRACE_OS,
- "flushing obj %d", obj->obj_id);
- yaffs_flush_file(obj, 1, 0);
+ /*
+ * This lock is needed to have exclusive access to inodes list in sb (superblock)
+ * Precisely this lock protects pointers linking elements of list.
+ * This lock is used only during switch from current inode to next one.
+ * So during processing of current inode, list manipulation is allowed.
+ * In effect after flush performed by this loop it is not guaranted
+ * that all inodes at that time are flushed. New nodes could have been added
+ * or old removed.
+ */
+ spin_lock(&inode_sb_list_lock);
+ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+
+ obj = yaffs_inode_to_obj(inode);
+
+ /*
+ * Actual work on current node during iteration
+ * is performed by yaffs_flush_file().
+ * This function doesn't process nodes without yaffs_obj attached
+ * and also those with attached one but not in dirty state.
+ * So to move forward quickly we can do this checks now
+ * and continue interation when conditions are not met.
+ */
+ if (!obj || !obj->dirty) {
+ continue;
}
+
+ /*
+ * This spinlock protects i_state and i_count which we need.
+ * There are inode states that are not suitable for flushing.
+ * Precisely I_NEW state which means 'inode under construction'
+ * And I_FREEING which means 'inode beeing evicted'
+ * i_count is incremented by __iget() and we use that to
+ * mark node as 'used' during flushing to prevent evict
+ * beeing triggered when i_count reaches 0.
+ */
+ spin_lock(&inode->i_lock);
+
+ /*
+ * More on i_state states:
+ *
+ * We cannot __iget() an inode in state I_FREEING
+ * this state is set in inode.c in iput_final()
+ * just befor call to evict() which in turn
+ * removes inode from superblocks list of inodes
+ * __iget() won't prevent this in any way.
+ * If indode is dirty at that time in must be handled
+ * in yaffs_evict_inode() callback - not here
+ *
+ * I_WILL_FREE is not possible state for yaffs
+ * In inode.c in iput_final() drop variable is always 1 (true)
+ * so this state won't ever be set.
+ * This is in turn because yaffs doesn't provide
+ * drop_inode() callback.
+ * Default one: generic_drop_inode() defaults to 1 (true)
+ * when there are no users for inode (i_count is 0).
+ * So it is skipped as yaffs doesn't make use of it.
+ * Eventually in the future when yaffs will do
+ * writing node schould be handled by write_inode_now()
+ *
+ * I_NEW is set by iget_locked() called from yaffs_iget()
+ * during creation of inode.
+ * Then gross lock is locked and yaffs creates new yaffs_obj
+ * for inode. Then grosslock is unlocked and I_NEW is cleared
+ * by unlock_new_inode() to finish inode creation
+ * This loop can reach I_NEW inodes and there
+ * would be two cases:
+ * 1) inode in I_NEW state but without yaffs_obj
+ * this one would be skipped
+ * 2) i node in I_NEW state with new yaffs_obj
+ * just before clearing I_NEW state
+ * But as I_NEW marks inode creation and can be
+ * extended in future it would be good to skip it also
+ * as it means inode under construction
+ *
+ * So this states are always skipped during iteration
+ */
+ if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
+ spin_unlock(&inode->i_lock);
+ continue;
+ }
+
+ /*
+ * "If i_count is zero, the inode cannot have any watches and
+ * doing an __iget/iput with MS_ACTIVE clear would actually
+ * evict all inodes with zero i_count from icache which is
+ * unnecessarily violent and may in fact be illegal to do."
+ * found in fs/notify/inode_mark.c
+ *
+ * I guess that this refers to logic from iput_final()
+ * to cases when drop is 0 (false) for nodes with i_count == 0
+ * As mentioned earlier for yaffs this cases are not possible.
+ * For yaffs nodes with i_count == 0 are always evicted.
+ * Precisely when i_count == 0 i_state is changed to I_FREEING
+ * under protection of i_lock (see iput() ... iput_final())
+ * and we skip such nodes as beeing removed and let yaffs_evict_inode()
+ * do the job.
+ *
+ * So this is not important for yaffs
+ */
+ //if (!atomic_read(&inode->i_count)) {
+ // spin_unlock(&inode->i_lock);
+ // continue;
+ //}
+
+ if (unlikely(yaffs_delay_iput(delayed_iput_list, inode))) {
+ // no memory we need to skip this node as we can't iget() it
+ // as we would be not be able to iput() it later
+ spin_unlock(&inode->i_lock);
+ continue; // or break - not sure yet which is better
+ }
+
+ /*
+ * Increment use counter to preven node removal during flush operation
+ */
+ __iget(inode);
+
+ /*
+ * Data protected by this lock is not accesed or changed anymore.
+ * inode won't be removed from list because we declared use of it by calling __iget()
+ * */
+ spin_unlock(&inode->i_lock);
+
+ /*
+ * yaffs_flush_file do I/O and can possibly loose CPU this is not allowed
+ * with spinlock beeing locked. We need to release it.
+ * It is done here because:
+ * 1) all quick continue conditions are over current inode is dirty and needs flushing
+ * 2) there is locking order for those two spinlock that we use
+ * and inode_sb_list_lock need to be unlocked after i_lock
+ */
+ spin_unlock(&inode_sb_list_lock);
+
+ /*
+ * Store this inode on temporary list.
+ * Is is needed beacuse iput() must be called to revert __iget()
+ * but this may lead eventually to yaffs_evict_inode() as we can be last users of it.
+ * But yaffs_evict_inode needs gross lock which we hold.
+ * We need to call iput() later when gross lock is released
+ * This is why this temporary list is beeing used.
+ * Oryginal patch used in WR4.3 was releasing here gross lock to call iput().
+ * This was wrong as in case we were holding last reference
+ * this would immediately trigger iput_final() .. evict() .. yaffs_evict_inode()
+ * which will removed node from list and iteration will fail.
+ */
+
+ yaffs_trace(YAFFS_TRACE_OS,
+ "flushing obj %d", obj->obj_id);
+ yaffs_flush_file(obj, 1, 0);
+
+ /* we need this lock to select next node and continue iteration */
+ spin_lock(&inode_sb_list_lock);
}
+ /* end of iteration release list lock used to detect end of list */
+ spin_unlock(&inode_sb_list_lock);
}
-static void yaffs_flush_super(struct super_block *sb, int do_checkpoint)
+
+static void yaffs_flush_super(struct super_block *sb, int do_checkpoint, struct list_head * delayed_iput_list)
{
struct yaffs_dev *dev = yaffs_super_to_dev(sb);
if (!dev)
return;
- yaffs_flush_inodes(sb);
+ yaffs_flush_inodes(sb, delayed_iput_list);
yaffs_update_dirty_dirs(dev);
yaffs_flush_whole_cache(dev);
if (do_checkpoint)
@@ -1554,18 +1737,31 @@
request_checkpoint ? "checkpoint requested" : "no checkpoint",
oneshot_checkpoint ? " one-shot" : "");
+ /*
+ * Temporary list for delayed iput of nodes 'locked' by __iget()
+ * during flush operation. This list is created on stack to
+ * ensure 1:1 relation between __iget() and iput() calls
+ */
+ LIST_HEAD(delayed_iput_list);
+
yaffs_gross_lock(dev);
do_checkpoint = ((request_checkpoint && !gc_urgent) ||
oneshot_checkpoint) && !dev->is_checkpointed;
if (sb->s_dirt || do_checkpoint) {
- yaffs_flush_super(sb, !dev->is_checkpointed && do_checkpoint);
+ yaffs_flush_super(sb, !dev->is_checkpointed && do_checkpoint, &delayed_iput_list);
sb->s_dirt = 0;
if (oneshot_checkpoint)
yaffs_auto_checkpoint &= ~4;
}
yaffs_gross_unlock(dev);
+ /*
+ * Call iput() on nodes 'locked' by __iget() by flush operation.
+ * It needs to be done after yaffs_gross_unlock()
+ */
+ yaffs_iput_inodes(&delayed_iput_list);
+
return 0;
}
@@ -1927,9 +2123,16 @@
yaffs_trace(YAFFS_TRACE_OS | YAFFS_TRACE_BACKGROUND,
"yaffs background thread shut down");
+ /*
+ * Temporary list for delayed iput of nodes 'locked' by __iget()
+ * during flush operation. This list is created on stack to
+ * ensure 1:1 relation between __iget() and iput() calls
+ */
+ LIST_HEAD(delayed_iput_list);
+
yaffs_gross_lock(dev);
- yaffs_flush_super(sb, 1);
+ yaffs_flush_super(sb, 1, &delayed_iput_list);
if (yaffs_dev_to_lc(dev)->put_super_fn)
yaffs_dev_to_lc(dev)->put_super_fn(sb);
@@ -1937,6 +2140,13 @@
yaffs_deinitialise(dev);
yaffs_gross_unlock(dev);
+
+ /*
+ * Call iput() on nodes 'locked' by __iget() by flush operation.
+ * It needs to be done after yaffs_gross_unlock()
+ */
+ yaffs_iput_inodes(&delayed_iput_list);
+
mutex_lock(&yaffs_context_lock);
list_del_init(&(yaffs_dev_to_lc(dev)->context_list));
mutex_unlock(&yaffs_context_lock);