rtgang-v4.4.patch

diff --git a/arch/arm/include/uapi/asm/unistd.h b/arch/arm/include/uapi/asm/unistd.h
index ede692ffa32e..c6ac61c8b18e 100644
--- a/arch/arm/include/uapi/asm/unistd.h
+++ b/arch/arm/include/uapi/asm/unistd.h
@@ -281,6 +281,7 @@
 #define __NR_remap_file_pages		(__NR_SYSCALL_BASE+253)
 					/* 254 for set_thread_area */
 					/* 255 for get_thread_area */
+#define __NR_bwlock			(__NR_SYSCALL_BASE+255)
 #define __NR_set_tid_address		(__NR_SYSCALL_BASE+256)
 #define __NR_timer_create		(__NR_SYSCALL_BASE+257)
 #define __NR_timer_settime		(__NR_SYSCALL_BASE+258)
diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S
index ac368bb068d1..09fdb29d6710 100644
--- a/arch/arm/kernel/calls.S
+++ b/arch/arm/kernel/calls.S
@@ -264,7 +264,7 @@
 		CALL(ABI(sys_epoll_wait, sys_oabi_epoll_wait))
 		CALL(sys_remap_file_pages)
 		CALL(sys_ni_syscall)	/* sys_set_thread_area */
-/* 255 */	CALL(sys_ni_syscall)	/* sys_get_thread_area */
+/* 255 */	CALL(sys_bwlock)	/* sys_get_thread_area */
 		CALL(sys_set_tid_address)
 		CALL(sys_timer_create)
 		CALL(sys_timer_settime)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ce0f61dcd887..a3f155c73714 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1405,6 +1405,9 @@ struct task_struct {
 #endif
 	struct sched_dl_entity dl;
 
+	/* Throttling Related Fields */
+	int corun_threshold_events;
+
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	/* list of struct preempt_notifier: */
 	struct hlist_head preempt_notifiers;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index c2b66a277e98..4268f06d789a 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -871,7 +871,7 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
 				      const struct iovec __user *rvec,
 				      unsigned long riovcnt,
 				      unsigned long flags);
-
+asmlinkage long sys_bwlock(pid_t pid, int cte);
 asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
 			 unsigned long idx1, unsigned long idx2);
 asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 1324b0292ec2..db4738aa40e8 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -662,6 +662,9 @@ __SC_COMP(__NR_recvmmsg, sys_recvmmsg, compat_sys_recvmmsg)
  */
 #define __NR_arch_specific_syscall 244
 
+#define __NR_bwlock 255
+__SYSCALL(__NR_bwlock, sys_bwlock)
+
 #define __NR_wait4 260
 __SC_COMP(__NR_wait4, sys_wait4, compat_sys_wait4)
 #define __NR_prlimit64 261
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 20253dbc8610..3c8223ecace9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -606,6 +606,34 @@ void resched_cpu(int cpu)
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 
+/*
+ * resched_cpu_force - force @cpu to reschedule without taking its rq->lock.
+ * Unlike resched_cpu(), rq->curr is read here without any protection, and a
+ * remote CPU gets a reschedule IPI whenever set_nr_and_not_polling() returns
+ * true for its current task. Used exclusively in RT-Gang related code.
+ */
+void resched_cpu_force (int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct task_struct *curr = rq->curr;
+
+	if (test_tsk_need_resched(curr))
+		return;		/* a reschedule is already pending */
+
+	cpu = cpu_of(rq);
+
+	if (cpu == smp_processor_id()) {
+		set_tsk_need_resched(curr);
+		set_preempt_need_resched();
+		return;
+	}
+
+	if (set_nr_and_not_polling(curr))
+		smp_send_reschedule(cpu);
+	else
+		trace_sched_wake_idle_without_ipi(cpu);
+}
+
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ_COMMON
 /*
@@ -3054,33 +3082,31 @@ static inline void schedule_debug(struct task_struct *prev)
 static inline struct task_struct *
 pick_next_task(struct rq *rq, struct task_struct *prev)
 {
-	const struct sched_class *class = &fair_sched_class;
+	const struct sched_class *class;
 	struct task_struct *p;
-
-	/*
-	 * Optimization: we know that if all tasks are in
-	 * the fair class we can call that function directly:
-	 */
-	if (likely(prev->sched_class == class &&
-		   rq->nr_running == rq->cfs.h_nr_running)) {
-		p = fair_sched_class.pick_next_task(rq, prev);
-		if (unlikely(p == RETRY_TASK))
-			goto again;
-
-		/* assumes fair_sched_class->next == idle_sched_class */
-		if (unlikely(!p))
-			p = idle_sched_class.pick_next_task(rq, prev);
-
-		return p;
-	}
+	bool skip_retry_flag = false;
 
 again:
 	for_each_class(class) {
 		p = class->pick_next_task(rq, prev);
 		if (p) {
-			if (unlikely(p == RETRY_TASK))
+			if (p == BLOCK_TASK) {
+				/*
+				 * Do not honor the RETRY request from the fair
+				 * class since blocking of task in RT class is
+				 * being done on purpose.
+				 */
+				skip_retry_flag = true;
+				continue;
+			}
+
+			if (p != RETRY_TASK)
+				/* We have a valid task. Return it! */
+				return p;
+
+			if (!skip_retry_flag && p == RETRY_TASK)
+				/* Restart the task picking loop */
 				goto again;
-			return p;
 		}
 	}
 
@@ -4202,6 +4228,34 @@ err_size:
 	return -E2BIG;
 }
 
+/*
+ * sys_bwlock - record the memory usage threshold that co-running tasks must
+ * respect while the given real-time task holds the RT-Gang lock.
+ *
+ * @pid	: pid of the real-time task (0 or the caller's own pid means current)
+ * @cte	: safe memory usage threshold for corunning tasks (0 selects default)
+ * Return: 0 on success, -ESRCH for an unknown pid, -EPERM for a non-RT task
+ */
+SYSCALL_DEFINE2(bwlock, pid_t, pid, int, cte)
+{
+	struct task_struct *p = current;
+	int retval = 0;
+
+	/* The task found by pid is only guaranteed to stay around under RCU */
+	rcu_read_lock ();
+	if (pid != 0 && current->pid != pid)
+		p = find_process_by_pid (pid);
+	if (!p)
+		retval = -ESRCH;
+	else if (!rt_task (p))
+		retval = -EPERM;
+	else
+		p->corun_threshold_events = cte;
+	rcu_read_unlock ();
+
+	return retval;
+}
+
 /**
  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
  * @pid: the pid in question.
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 69631fa46c2f..2290629b3f4c 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -6,6 +6,14 @@
 SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
 
 /*
+ * Enable the real-time gang scheduling framework (RT-Gang). RT-Gang allows
+ * execution of only a single (multi-threaded) real-time task (i.e., gang) at
+ * any given time across all system cores.
+ * NOTE: This feature is disabled by default.
+ */
+SCHED_FEAT(RT_GANG_LOCK, false)
+
+/*
  * Place new tasks ahead so that they do not starve already running
  * tasks
  */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 8ec86abe0ea1..bb45581fd8f2 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -8,6 +8,11 @@
 #include <linux/slab.h>
 #include <linux/irq_work.h>
 
+rt_gang_lock_t	rt_gang_lock;			/* the RT-Gang lock instance */
+rt_gang_lock_t	*rt_glock = &rt_gang_lock;	/* pointer used by the helpers */
+int be_mem_threshold = SYS_MAX_LLC_EVENTS;	/* reset to this on lock free */
+EXPORT_SYMBOL(be_mem_threshold);
+
 int sched_rr_timeslice = RR_TIMESLICE;
 
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
@@ -1443,7 +1448,7 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
 	return next;
 }
 
-static struct task_struct *_pick_next_task_rt(struct rq *rq)
+static struct task_struct *__peek_next_task_rt(struct rq *rq)
 {
 	struct sched_rt_entity *rt_se;
 	struct task_struct *p;
@@ -1456,7 +1461,6 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
 	} while (rt_rq);
 
 	p = rt_task_of(rt_se);
-	p->se.exec_start = rq_clock_task(rq);
 
 	return p;
 }
@@ -1491,19 +1495,70 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
 	 * We may dequeue prev's rt_rq in put_prev_task().
 	 * So, we update time before rt_nr_running check.
 	 */
-	if (prev->sched_class == &rt_sched_class)
+	if (prev->sched_class == &rt_sched_class) {
 		update_curr_rt(rq);
 
+		/*
+		 * If 'prev' is a member of the current RT gang, update the
+		 * locked_cores mask and release the RT gang lock if necessary.
+		 */
+		if (sched_feat (RT_GANG_LOCK)) {
+			raw_spin_lock (&rt_glock->lock);
+			if (rt_glock->lock_held)
+				try_glock_release (prev);
+			raw_spin_unlock (&rt_glock->lock);
+		}
+	}
+
 	if (!rt_rq->rt_queued)
 		return NULL;
 
-	put_prev_task(rq, prev);
+	p = __peek_next_task_rt (rq);
+
+	/* Do not apply RT gang to high-priority kernel threads */
+	if (sched_feat (RT_GANG_LOCK) && p->mm &&
+			(p->prio > RT_SYS_PRIO_THRESHOLD)) {
+		raw_spin_lock (&rt_glock->lock);
+		if (!rt_glock->lock_held) {
+			/* No RT gang exist currently; begin a new gang */
+			BUG_ON (cpumask_weight (rt_glock->locked_cores) != 0);
+			BUG_ON (cpumask_weight (rt_glock->blocked_cores) != 0);
+
+			TRACER (p, "Acquiring lock");
+			rt_glock->prio = p->prio;
+			gang_lock_cpu (p);
+			rt_glock->lock_held = true;
+			update_mem_threshold (p);
+		} else {
+			BUG_ON (cpumask_weight (rt_glock->locked_cores) == 0);
+			if (rt_glock->prio > p->prio) {
+				/* 'p' has higher priority; preempt */
+				TRACER (p, "Preempted by gang");
+				do_gang_preemption ();
+				rt_glock->prio = p->prio;
+				gang_lock_cpu (p);
+				update_mem_threshold (p);
+			} else if (p->prio == rt_glock->prio) {
+				/* 'p' is part of the current RT gang */
+				gang_lock_cpu (p);
+			} else {
+				/* 'p' has lower priority; blocked */
+				TRACER (p, "Blocking gang");
+				cpumask_set_cpu (smp_processor_id (),
+						rt_glock->blocked_cores);
 
-	p = _pick_next_task_rt(rq);
+				raw_spin_unlock (&rt_glock->lock);
+				return BLOCK_TASK;
+			}
+		}
+		raw_spin_unlock (&rt_glock->lock);
+	}
+
+	put_prev_task (rq, prev);
+	p->se.exec_start = rq_clock_task (rq);
 
 	/* The running task is never eligible for pushing */
 	dequeue_pushable_task(rq, p);
-
 	queue_push_tasks(rq);
 
 	return p;
@@ -2115,6 +2170,8 @@ void __init init_sched_rt_class(void)
 		zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
 					GFP_KERNEL, cpu_to_node(i));
 	}
+
+	INIT_GANG_LOCK ();
 }
 #endif /* CONFIG_SMP */
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0517abd7dd73..0602ccb63333 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1168,6 +1168,7 @@ static const u32 prio_to_wmult[40] = {
 #define DEQUEUE_SAVE		0x02
 
 #define RETRY_TASK		((void *)-1UL)
+#define BLOCK_TASK		((void *)-2UL)
 
 struct sched_class {
 	const struct sched_class *next;
@@ -1298,6 +1299,7 @@ extern void init_sched_fair_class(void);
 
 extern void resched_curr(struct rq *rq);
 extern void resched_cpu(int cpu);
+extern void resched_cpu_force(int cpu);
 
 extern struct rt_bandwidth def_rt_bandwidth;
 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
@@ -1783,3 +1785,144 @@ static inline void account_reset_rq(struct rq *rq)
 	rq->prev_steal_time_rq = 0;
 #endif
 }
+
+/*
+ * GANG SCHEDULING RELATED DECLARATIONS: state of the global RT-Gang lock.
+ */
+typedef struct rt_gang_lock {
+	raw_spinlock_t		lock;		/* held around gang updates */
+	bool			lock_held;	/* true while a gang runs */
+	cpumask_var_t		locked_cores;	/* CPUs running a gang thread */
+	cpumask_var_t		blocked_cores;	/* CPUs that got BLOCK_TASK */
+	int			prio;		/* gang prio; INT_MAX if free */
+	struct task_struct*	gthreads [NR_CPUS];	/* gang thread per CPU */
+} rt_gang_lock_t;
+
+extern int be_mem_threshold;
+extern rt_gang_lock_t	*rt_glock;
+
+#define RT_SYS_PRIO_THRESHOLD		(50)
+/* Put the global RT-Gang lock into its initial free state. The expansion has
+ * no trailing ';' so that it behaves as a single statement at the call site. */
+#define INIT_GANG_LOCK()						\
+do {									\
+	raw_spin_lock_init (&rt_glock->lock);				\
+	rt_glock->lock_held = false;					\
+	zalloc_cpumask_var (&rt_glock->locked_cores, GFP_KERNEL);	\
+	zalloc_cpumask_var (&rt_glock->blocked_cores, GFP_KERNEL);	\
+	rt_glock->prio = INT_MAX;					\
+	memset (rt_glock->gthreads, 0, sizeof (rt_glock->gthreads));	\
+} while (0)
+
+/*
+ * Default memory usage threshold for best-effort tasks. On a system with
+ * 64-Byte cache line size, this equals 100 GBytes/sec i.e., no throttling.
+ */
+#define	SYS_MAX_LLC_EVENTS		(1638400)
+
+/*
+ * The following budget (for using main memory) is applied by default to all
+ * best-effort tasks on a per-core basis while a real-time task is executing.
+ * On a system with 64-Byte cache line size, the value specified below comes
+ * out to be 100 MBytes/sec.
+ * The goal here is to throttle 'aggressively' by default so that the
+ * best-effort tasks are not able to interfere with the real-time tasks.
+ */
+#define	SYS_DEFAULT_LLC_EVENTS		(1638)
+
+#undef RT_GANG_DEBUG	/* replace with #define to get TRACER() output */
+#ifdef RT_GANG_DEBUG
+#define TRACER(task, msg)						\
+	printk (KERN_INFO "[G:] core=%d task=%-20s prio=%-3d"		\
+			"mm=%p pid=%-6d tcpu=%d | %s\n",		\
+			smp_processor_id (), (task)->comm,		\
+			(task)->prio, (task)->mm, (task)->pid,		\
+			task_cpu (task), (msg))
+#else
+#define TRACER(task, msg)	do { } while (0)
+#endif
+
+/* Mark the executing CPU as locked by the gang and remember its @thread. */
+static inline void gang_lock_cpu (struct task_struct *thread)
+{
+	const int this_cpu = smp_processor_id ();
+
+	TRACER (thread, "Adding new gang member");
+	/* Record the owner along with this CPU's bit in the locked mask */
+	rt_glock->gthreads [this_cpu] = thread;
+	cpumask_set_cpu (this_cpu, rt_glock->locked_cores);
+}
+
+/*
+ * Force a reschedule on every CPU in @mask except the one executing this.
+ */
+static inline void resched_cpus (cpumask_var_t mask)
+{
+	const int self = smp_processor_id ();
+	int target;
+
+	for_each_cpu (target, mask) {
+		if (target != self)
+			resched_cpu_force (target);
+	}
+}
+
+/* Evict the running gang: clear each locked CPU's gang thread, kick every
+ * such CPU other than the executing one, then empty the locked mask. */
+static inline void do_gang_preemption (void)
+{
+	const int self = smp_processor_id ();
+	int victim;
+
+	for_each_cpu (victim, rt_glock->locked_cores) {
+		struct task_struct **slot = &rt_glock->gthreads [victim];
+
+		WARN_ON (*slot == NULL);
+		TRACER (*slot, "Preempting thread");
+		*slot = NULL;
+		if (victim != self)
+			resched_cpu_force (victim);
+	}
+	cpumask_clear (rt_glock->locked_cores);
+}
+
+/*
+ * Drop @thread's hold on the RT-Gang lock and, once no CPU is locked any
+ * more, free the lock and kick the CPUs recorded as blocked.
+ */
+static inline void try_glock_release (struct task_struct *thread)
+{
+	int core;
+
+	WARN_ON (cpumask_weight (rt_glock->locked_cores) == 0);
+
+	/* A migrated task may own several cores; give back each of them */
+	for_each_cpu (core, rt_glock->locked_cores) {
+		if (rt_glock->gthreads [core] != thread)
+			continue;
+		TRACER (thread, "Releasing lock");
+		WARN_ON (!rt_prio (thread->prio));
+		cpumask_clear_cpu (core, rt_glock->locked_cores);
+	}
+
+	if (cpumask_weight (rt_glock->locked_cores) != 0)
+		return;
+
+	/* RT-Gang lock is now free. Reschedule blocked cores. */
+	TRACER (thread, "Lock free");
+	rt_glock->prio = INT_MAX;
+	rt_glock->lock_held = false;
+	be_mem_threshold = SYS_MAX_LLC_EVENTS;
+	resched_cpus (rt_glock->blocked_cores);
+	cpumask_clear (rt_glock->blocked_cores);
+}
+
+/* Publish @thread's corun threshold through be_mem_threshold; fall back to
+ * SYS_DEFAULT_LLC_EVENTS when the thread's value is zero. */
+static inline void update_mem_threshold (struct task_struct *thread)
+{
+	const int requested = thread->corun_threshold_events;
+
+	/* Zero selects the default budget */
+	be_mem_threshold = requested ? requested : SYS_DEFAULT_LLC_EVENTS;
+}