--- linux-2.6.9/include/linux/sysctl.h.orig	2009-03-19 22:32:03.058022000 -0700
+++ linux-2.6.9/include/linux/sysctl.h	2009-03-19 22:36:18.724057000 -0700
@@ -184,6 +184,8 @@ enum
 	VM_NFS_WB_LOWMEM=36,    /* limit NFS writes to Lowmem */
 	VM_SWAP_TOKEN_TIMEOUT=37, /* default time for token time out */
 	VM_MAX_WRITEBACK_PAGES=38, /* maximum pages written per writeback loop */
+	VM_KSWAPD_CONGESTION_STALLS=39, /* int: count of direct reclaim stalls */
+	VM_KSWAPD_DIRECT_RECLAIM_DELAY=40, /* long: msecs to stall direct reclaim threads */
 };
 
 
--- linux-2.6.9/kernel/sysctl.c.orig	2009-03-19 22:32:03.062024000 -0700
+++ linux-2.6.9/kernel/sysctl.c	2009-03-19 22:34:34.889488000 -0700
@@ -71,6 +71,8 @@ extern int printk_ratelimit_jiffies;
 extern int printk_ratelimit_burst;
 extern int percpu_pagelist_fraction;
 extern int wake_balance;
+extern int vm_kswapd_congestion_stalls;
+extern long vm_kswapd_direct_reclaim_delay;
 extern int sysctl_drop_caches;
 extern int write_mapped;
 extern int max_writeback_pages;
@@ -747,6 +749,46 @@ static ctl_table kern_table[] = {
 static int zero;
 static int one_hundred = 100;
 
+/* these are in milliseconds */
+static long kswapd_timer_min = 1;		/* 1 millisecond */
+static long kswapd_timer_max = (1000 * 10);	/* 10 seconds */
+
+/* sysctl strategy function to convert jiffies to milliseconds.  */
+static int kswapd_strategy_ms_jiffies(ctl_table *table, int __user *name,
+		int nlen,
+		void __user *oldval, size_t __user *oldlenp,
+		void __user *newval, size_t newlen, void **context)
+{
+
+	if (oldval) {
+		size_t olen;
+
+		if (oldlenp) {
+			if (get_user(olen, oldlenp))
+				return -EFAULT;
+
+			if (olen != sizeof (long))
+				return -EINVAL;
+		}
+		if (put_user((*(long *)(table->data) * 1000) / HZ,
+		    (long __user *)oldval) ||
+		    (oldlenp && put_user(sizeof (long), oldlenp)))
+			return -EFAULT;
+	}
+	if (newval && newlen) {
+		long new;
+
+		if (newlen != sizeof (long))
+			return -EINVAL;
+
+		if (get_user(new, (long __user *)newval))
+			return -EFAULT;
+
+		*(long *)(table->data) = (new * HZ) / 1000;
+	}
+	return 1;
+}
+
 
 static ctl_table vm_table[] = {
 	{
@@ -838,6 +880,25 @@ static ctl_table vm_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
+	{
+		.ctl_name	= VM_KSWAPD_CONGESTION_STALLS,
+		.procname	= "kswapd_congestion_stalls",
+		.data		= &vm_kswapd_congestion_stalls,
+		.maxlen		= sizeof(int),
+		.mode		= 0444,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= VM_KSWAPD_DIRECT_RECLAIM_DELAY,
+		.procname	= "kswapd_direct_reclaim_delay",
+		.data		= &vm_kswapd_direct_reclaim_delay,
+		.maxlen		= sizeof(vm_kswapd_direct_reclaim_delay),
+		.mode		= 0644,
+		.proc_handler	= &proc_doulongvec_ms_jiffies_minmax,
+		.strategy	= &kswapd_strategy_ms_jiffies,
+		.extra1		= &kswapd_timer_min,
+		.extra2		= &kswapd_timer_max,
+	},
 #ifdef CONFIG_HUGETLB_PAGE
 	 {
 		.ctl_name	= VM_HUGETLB_PAGES,
--- linux-2.6.9/mm/vmscan.c.orig	2009-03-19 22:31:53.670411000 -0700
+++ linux-2.6.9/mm/vmscan.c	2009-03-19 22:34:34.901488000 -0700
@@ -130,12 +130,24 @@ struct shrinker {
  * From 0 .. 100.  Higher means more swappy.
  */
 int vm_swappiness = 60;
-
 /*
  * From 0 .. 100.  Higher means more inactive memory.
  */
 int vm_inactive_percent = 0;
 
+/*
+ * vm_kswapd_direct_reclaim_delay is in jiffies so that it can be used
+ * directly by blk_congestion_wait().
+ * It is read/written through /proc/sys/vm/kswapd_direct_reclaim_delay.
+ * Sysctl converts an input value from milliseconds to jiffies.  It displays
+ * kswapd_direct_reclaim_delay's value in milliseconds.
+ * kswapd_direct_reclaim_delay is made visible in milliseconds so that the
+ * user does not need to know the kernel's HZ value and make the conversion
+ * to jiffies.
+ */
+long vm_kswapd_direct_reclaim_delay = HZ / 4;  /* 1/4 sec = 250 milliseconds */
+int vm_kswapd_congestion_stalls;
+
 static long total_memory;
 
 static LIST_HEAD(shrinker_list);
@@ -639,6 +651,7 @@ static void shrink_cache(struct zone *zo
 	spin_unlock_irq(&zone->lru_lock);
 done:
 	pagevec_release(&pvec);
+	cond_resched();
 }
 
 /*
@@ -702,6 +715,7 @@ refill_inactive_zone(struct zone *zone, 
 	}
 	zone->nr_active -= pgmoved;
 	spin_unlock_irq(&zone->lru_lock);
+	cond_resched();
 
 	/*
 	 * `distress' is a measure of how much trouble we're having reclaiming
@@ -781,6 +795,7 @@ refill_inactive_zone(struct zone *zone, 
 			if (buffer_heads_over_limit)
 				pagevec_strip(&pvec);
 			__pagevec_release(&pvec);
+			cond_resched();
 			spin_lock_irq(&zone->lru_lock);
 		}
 	}
@@ -789,6 +804,7 @@ refill_inactive_zone(struct zone *zone, 
 	if (buffer_heads_over_limit) {
 		spin_unlock_irq(&zone->lru_lock);
 		pagevec_strip(&pvec);
+		cond_resched();
 		spin_lock_irq(&zone->lru_lock);
 	}
 
@@ -806,12 +822,14 @@ refill_inactive_zone(struct zone *zone, 
 			pgmoved = 0;
 			spin_unlock_irq(&zone->lru_lock);
 			__pagevec_release(&pvec);
+			cond_resched();
 			spin_lock_irq(&zone->lru_lock);
 		}
 	}
 	zone->nr_active += pgmoved;
 	spin_unlock_irq(&zone->lru_lock);
 	pagevec_release(&pvec);
+	cond_resched();
 
 	mod_page_state_zone(zone, pgrefill, pgscanned);
 	mod_page_state(pgdeactivate, pgdeactivate);
@@ -865,6 +883,19 @@ shrink_zone(struct zone *zone, struct sc
 					(unsigned long)SWAP_CLUSTER_MAX);
 			nr_inactive -= sc->nr_to_scan;
 			shrink_cache(zone, sc);
+
+			/*
+			 * If not kswapd, stall briefly before hammering this
+			 * path from many direct reclaim threads.  The stall
+			 * counter is a statistic; the racy increment is OK.
+			 */
+			if (!current_is_kswapd()) {
+				vm_kswapd_congestion_stalls++;
+				blk_congestion_wait(WRITE,
+					vm_kswapd_direct_reclaim_delay);
+				break;
+			}
+
 			if (sc->nr_to_reclaim <= 0)
 				break;
 		/* get whatever we can if inactive is less than vm_inactive_percent of active */ 
@@ -875,6 +906,7 @@ shrink_zone(struct zone *zone, struct sc
                         if (sc->nr_to_reclaim <= 0)
                                 break;
 		}
+		cond_resched();
 	}
 
 	throttle_vm_writeout();
