patch_ck0__ck1.patch
This is the diff between my merge of the -ck patchset onto the official kernel release (-ck0) and the official release made by Con Kolivas on Feb 18 2018 (-ck1).
Reading through, it looks like the only differences are:
1. Addition of options related to runqueue sharing (this was an experimental patch that was not included in 4.14-ck, but it is now considered stable enough); see the example after this list.
2. A possible fix for a condition I noted on -ck0 after resuming from suspend/sleep (I'm running this in VirtualBox, and when the host OS goes to sleep and then wakes up, there are sometimes panics and high CPU usage/load on the guest from then on until it is rebooted)
3. Some minor cleanups
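Regarding item 1, a quick summary taken from the patch's own Kconfig and documentation hunks below: the sharing level can be chosen at build time via the new "CPU scheduler runqueue sharing" Kconfig choice, or at boot time with the rqshare parameter, eg:
  rqshare=mc
Valid values are none, smt, mc and smp; mc is the default when MuQSS is enabled.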
Of course I would recommend using the official -ck1 release in lieu of my merge (-ck0). This patch is provided in case you need, for some reason, to bring an existing -ck0 tree up to -ck1.
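Applying it should look roughly like the following (assuming the patch file sits next to an already-patched linux-4.15-ck0 source tree; adjust paths as needed):
  cd linux-4.15-ck0
  patch -p1 < ../patch_ck0__ck1.patch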
Enjoy!!
- Tim
--- linux-4.15-ck0/arch/x86/Kconfig 2018-02-04 14:48:26.000000000 -0500
+++ linux-4.15-ck1/arch/x86/Kconfig 2018-02-18 01:53:00.000000000 -0500
@@ -1011,6 +1011,79 @@
If unsure say Y here.
+choice
+ prompt "CPU scheduler runqueue sharing"
+ default RQ_MC if SCHED_MUQSS
+ default RQ_NONE
+
+config RQ_NONE
+ bool "No sharing"
+ help
+ This is the default behaviour where the CPU scheduler has one runqueue
+ per CPU, whether it is a physical or logical CPU (hyperthread).
+
+ This can still be enabled runtime with the boot parameter
+ rqshare=none
+
+ If unsure, say N.
+
+config RQ_SMT
+ bool "SMT (hyperthread) siblings"
+ depends on SCHED_SMT && SCHED_MUQSS
+
+ help
+ With this option enabled, the CPU scheduler will have one runqueue
+ shared by SMT (hyperthread) siblings. As these logical cores share
+ one physical core, sharing the runqueue resource can lead to decreased
+ overhead, lower latency and higher throughput.
+
+ This can still be enabled runtime with the boot parameter
+ rqshare=smt
+
+ If unsure, say N.
+
+config RQ_MC
+ bool "Multicore siblings"
+ depends on SCHED_MC && SCHED_MUQSS
+ help
+ With this option enabled, the CPU scheduler will have one runqueue
+ shared by multicore siblings in addition to any SMT siblings.
+ As these physical cores share caches, sharing the runqueue resource
+ will lead to lower latency, but its effects on overhead and throughput
+ are less predictable. As a general rule, 6 or fewer cores will likely
+ benefit from this, while larger CPUs will only derive a latency
+ benefit. If your workloads are primarily single threaded, this will
+ possibly worsen throughput. If you are only concerned about latency
+ then enable this regardless of how many cores you have.
+
+ This can still be enabled runtime with the boot parameter
+ rqshare=mc
+
+ If unsure, say Y.
+
+config RQ_SMP
+ bool "Symmetric Multi-Processing"
+ depends on SMP && SCHED_MUQSS
+ help
+ With this option enabled, the CPU scheduler will have one runqueue
+ shared by all physical CPUs unless they are on separate NUMA nodes.
+ As physical CPUs usually do not share resources, sharing the runqueue
+ will normally worsen throughput but improve latency. If you only
+ care about latency enable this.
+
+ This can still be enabled runtime with the boot parameter
+ rqshare=smp
+
+ If unsure, say N.
+endchoice
+
+config SHARERQ
+ int
+ default 0 if RQ_NONE
+ default 1 if RQ_SMT
+ default 2 if RQ_MC
+ default 3 if RQ_SMP
+
source "kernel/Kconfig.preempt"
config UP_LATE_INIT
--- linux-4.15-ck0/Documentation/scheduler/sched-MuQSS.txt 2018-02-04 14:48:26.000000000 -0500
+++ linux-4.15-ck1/Documentation/scheduler/sched-MuQSS.txt 2018-02-18 01:53:00.000000000 -0500
@@ -66,6 +66,13 @@
next task scheduling decision and task wakeup CPU choice to allow balancing to
happen by virtue of its choices.
+As a further evolution of the design, MuQSS normally configures sharing of
+runqueues in a logical fashion for when CPU resources are shared for improved
+latency and throughput. By default it shares runqueues and locks between
+multicore siblings. Optionally it can be configured to run with sharing of
+SMT siblings only, all SMP packages or no sharing at all. Additionally it can
+be selected at boot time.
+
Design details.
@@ -253,6 +260,25 @@
throughput. Latency will still be bound by rr_interval, but on a per-CPU basis
instead of across the whole system.
+Runqueue sharing.
+
+By default MuQSS chooses to share runqueue resources (specifically the skip
+list and locking) between multicore siblings. It is configurable at build time
+to select between None, SMT, MC and SMP, corresponding to no sharing, sharing
+only between simultaneous mulithreading siblings, multicore siblings, or
+symmetric multiprocessing physical packages. Additionally it can be se at
+bootime with the use of the rqshare parameter. The reason for configurability
+is that some architectures have CPUs with many multicore siblings (>= 16)
+where it may be detrimental to throughput to share runqueues and another
+sharing option may be desirable. Additionally, more sharing than usual can
+improve latency on a system-wide level at the expense of throughput if desired.
+
+The options are:
+none, smt, mc, smp
+
+eg:
+ rqshare=mc
+
Isochronous scheduling:
Isochronous scheduling is a unique scheduling policy designed to provide
--- linux-4.15-ck0/include/linux/sched.h 2018-02-04 14:48:26.000000000 -0500
+++ linux-4.15-ck1/include/linux/sched.h 2018-02-18 01:53:00.000000000 -0500
@@ -9,10 +9,6 @@
#include <uapi/linux/sched.h>
-#ifndef SCHED_DEADLINE
-#define SCHED_DEADLINE 6
-#endif
-
#include <asm/current.h>
#include <linux/pid.h>
--- linux-4.15-ck0/kernel/sched/cputime.c 2018-02-04 14:48:26.000000000 -0500
+++ linux-4.15-ck1/kernel/sched/cputime.c 2018-02-18 01:53:00.000000000 -0500
@@ -5,7 +5,6 @@
#include <linux/static_key.h>
#include <linux/context_tracking.h>
#include <linux/sched/cputime.h>
-#include <linux/cgroup.h>
#include "sched.h"
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
--- linux-4.15-ck0/kernel/sched/idle.c 2018-02-04 14:48:26.000000000 -0500
+++ linux-4.15-ck1/kernel/sched/idle.c 2018-02-18 01:53:00.000000000 -0500
@@ -103,9 +103,6 @@
static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
int next_state)
{
- int cpu = smp_processor_id();
- bool pending = false;
-
/*
* The idle task must be scheduled, it is pointless to go to idle, just
* update no idle residency and return.
@@ -214,6 +211,7 @@
{
int cpu = smp_processor_id();
bool pending = false;
+
/*
* If the arch has a polling bit, we maintain an invariant:
*
@@ -224,14 +222,10 @@
*/
__current_set_polling();
- if (unlikely(softirq_pending(cpu)))
- {
- pending = true;
- }
- else
- {
- tick_nohz_idle_enter();
- }
+ if (unlikely(softirq_pending(cpu)))
+ pending = true;
+ else
+ tick_nohz_idle_enter();
while (!need_resched()) {
check_pgt_cache();
--- linux-4.15-ck0/kernel/sched/MuQSS.c 2018-02-04 14:48:26.000000000 -0500
+++ linux-4.15-ck1/kernel/sched/MuQSS.c 2018-02-18 01:53:00.000000000 -0500
@@ -43,8 +43,8 @@
#include <linux/binfmts.h>
#include <linux/context_tracking.h>
#include <linux/rcupdate_wait.h>
-#include <linux/skip_list.h>
#include <linux/compat.h>
+#include <linux/skip_list.h>
#include <linux/blkdev.h>
#include <linux/kprobes.h>
@@ -55,8 +55,8 @@
#include <linux/profile.h>
#include <linux/security.h>
#include <linux/syscalls.h>
-#include <linux/tick.h>
#include <linux/sched/isolation.h>
+#include <linux/tick.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -113,8 +113,41 @@
void print_scheduler_version(void)
{
- printk(KERN_INFO "MuQSS CPU scheduler v0.162 by Con Kolivas.\n");
+ printk(KERN_INFO "MuQSS CPU scheduler v0.170 by Con Kolivas.\n");
+}
+
+#define RQSHARE_NONE 0
+#define RQSHARE_SMT 1
+#define RQSHARE_MC 2
+#define RQSHARE_SMP 3
+
+/*
+ * This determines what level of runqueue sharing will be done and is
+ * configurable at boot time with the bootparam rqshare =
+ */
+static int rqshare __read_mostly = CONFIG_SHARERQ; /* Default RQSHARE_MC */
+
+static int __init set_rqshare(char *str)
+{
+ if (!strncmp(str, "none", 4)) {
+ rqshare = RQSHARE_NONE;
+ return 0;
+ }
+ if (!strncmp(str, "smt", 3)) {
+ rqshare = RQSHARE_SMT;
+ return 0;
+ }
+ if (!strncmp(str, "mc", 2)) {
+ rqshare = RQSHARE_MC;
+ return 0;
+ }
+ if (!strncmp(str, "smp", 2)) {
+ rqshare = RQSHARE_SMP;
+ return 0;
+ }
+ return 1;
}
+__setup("rqshare=", set_rqshare);
/*
* This is the time all tasks within the same priority round robin.
@@ -149,6 +182,7 @@
*/
static int prio_ratios[NICE_WIDTH] __read_mostly;
+
/*
* The quota handed out to tasks of all priority levels when refilling their
* time_slice.
@@ -158,15 +192,17 @@
return MS_TO_US(rr_interval);
}
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+
#ifdef CONFIG_SMP
-static cpumask_t cpu_idle_map ____cacheline_aligned_in_smp;
-#endif
+/*
+ * Total number of runqueues. Equals number of CPUs when there is no runqueue
+ * sharing but is usually less with SMT/MC sharing of runqueues.
+ */
+static int total_runqueues __read_mostly = 1;
-/* CPUs with isolated domains */
-cpumask_var_t cpu_isolated_map;
+static cpumask_t cpu_idle_map ____cacheline_aligned_in_smp;
-DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-#ifdef CONFIG_SMP
struct rq *cpu_rq(int cpu)
{
return &per_cpu(runqueues, (cpu));
@@ -309,12 +345,6 @@
return p->on_rq == TASK_ON_RQ_MIGRATING;
}
-static inline int rq_trylock(struct rq *rq)
- __acquires(rq->lock)
-{
- return raw_spin_trylock(&rq->lock);
-}
-
/*
* Any time we have two runqueues locked we use that as an opportunity to
* synchronise niffies to the highest value as idle ticks may have artificially
@@ -341,11 +371,11 @@
__acquires(rq2->lock)
{
if (rq1 < rq2) {
- raw_spin_lock(&rq1->lock);
- raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
+ raw_spin_lock(rq1->lock);
+ raw_spin_lock_nested(rq2->lock, SINGLE_DEPTH_NESTING);
} else {
- raw_spin_lock(&rq2->lock);
- raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
+ raw_spin_lock(rq2->lock);
+ raw_spin_lock_nested(rq1->lock, SINGLE_DEPTH_NESTING);
}
}
@@ -354,8 +384,8 @@
__acquires(rq2->lock)
{
BUG_ON(!irqs_disabled());
- if (rq1 == rq2) {
- raw_spin_lock(&rq1->lock);
+ if (rq1->lock == rq2->lock) {
+ raw_spin_lock(rq1->lock);
__acquire(rq2->lock); /* Fake it out ;) */
} else
__double_rq_lock(rq1, rq2);
@@ -372,9 +402,9 @@
__releases(rq1->lock)
__releases(rq2->lock)
{
- raw_spin_unlock(&rq1->lock);
- if (rq1 != rq2)
- raw_spin_unlock(&rq2->lock);
+ raw_spin_unlock(rq1->lock);
+ if (rq1->lock != rq2->lock)
+ raw_spin_unlock(rq2->lock);
else
__release(rq2->lock);
}
@@ -387,7 +417,7 @@
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
- do_raw_spin_lock(&rq->lock);
+ do_raw_spin_lock(rq->lock);
}
}
@@ -398,7 +428,7 @@
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
- do_raw_spin_unlock(&rq->lock);
+ do_raw_spin_unlock(rq->lock);
}
preempt_enable();
}
@@ -406,9 +436,9 @@
/* Specially nest trylock an rq */
static inline bool trylock_rq(struct rq *this_rq, struct rq *rq)
{
- if (unlikely(!do_raw_spin_trylock(&rq->lock)))
+ if (unlikely(!do_raw_spin_trylock(rq->lock)))
return false;
- spin_acquire(&rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_);
+ spin_acquire(rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_);
synchronise_niffies(this_rq, rq);
return true;
}
@@ -416,8 +446,8 @@
/* Unlock a specially nested trylocked rq */
static inline void unlock_rq(struct rq *rq)
{
- spin_release(&rq->lock.dep_map, 1, _RET_IP_);
- do_raw_spin_unlock(&rq->lock);
+ spin_release(rq->lock.dep_map, 1, _RET_IP_);
+ do_raw_spin_unlock(rq->lock);
}
/*
@@ -562,7 +592,7 @@
if (!(p->flags & PF_KTHREAD)) {
struct rq *rq = task_rq(p);
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq->lock);
}
#endif
if (test_tsk_need_resched(p))
@@ -625,7 +655,7 @@
* fix up the runqueue lock - which gets 'carried over' from
* prev into current:
*/
- spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
+ spin_acquire(rq->lock.dep_map, 0, 0, _THIS_IP_);
#ifdef CONFIG_SMP
/*
@@ -645,7 +675,7 @@
#else
task_thread_info(prev)->cpu = prev->wake_cpu;
#endif
- raw_spin_unlock(&rq->lock);
+ raw_spin_unlock(rq->lock);
raw_spin_lock(&prev->pi_lock);
rq = __task_rq_lock(prev);
@@ -659,8 +689,6 @@
raw_spin_unlock(&prev->pi_lock);
}
#endif
- /* Accurately set nr_running here for load average calculations */
- rq->nr_running = rq->sl->entries + !rq_idle(rq);
rq_unlock(rq);
do_pending_softirq(rq, current);
@@ -755,11 +783,12 @@
static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
skiplist_delete(rq->sl, &p->node);
- rq->best_key = rq->node.next[0]->key;
+ rq->best_key = rq->node->next[0]->key;
update_clocks(rq);
if (!(flags & DEQUEUE_SAVE))
sched_info_dequeued(task_rq(p), p);
+ rq->nr_running--;
update_load_avg(rq, flags);
}
@@ -840,9 +869,10 @@
sched_info_queued(rq, p);
randseed = (rq->niffies >> 10) & 0xFFFFFFFF;
skiplist_insert(rq->sl, &p->node, sl_id, p, randseed);
- rq->best_key = rq->node.next[0]->key;
+ rq->best_key = rq->node->next[0]->key;
if (p->in_iowait)
cflags |= SCHED_CPUFREQ_IOWAIT;
+ rq->nr_running++;
update_load_avg(rq, cflags);
}
@@ -1269,7 +1299,7 @@
* task_rq_lock().
*/
WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
- lockdep_is_held(&rq->lock)));
+ lockdep_is_held(rq->lock)));
#endif
trace_sched_migrate_task(p, new_cpu);
@@ -1289,7 +1319,7 @@
* We should only be calling this on a running task if we're
* holding rq lock.
*/
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq->lock);
/*
* We can't change the task_thread_info CPU on a running task
@@ -1523,14 +1553,30 @@
#ifdef CONFIG_SMP
/*
* Check to see if p can run on cpu, and if not, whether there are any online
- * CPUs it can run on instead.
+ * CPUs it can run on instead. This only happens with the hotplug threads that
+ * bring up the CPUs.
*/
+static inline bool sched_other_cpu(struct task_struct *p, int cpu)
+{
+ if (likely(cpumask_test_cpu(cpu, &p->cpus_allowed)))
+ return false;
+ if (p->nr_cpus_allowed == 1) {
+ cpumask_t valid_mask;
+
+ cpumask_and(&valid_mask, &p->cpus_allowed, cpu_online_mask);
+ if (unlikely(cpumask_empty(&valid_mask)))
+ return false;
+ }
+ return true;
+}
+
static inline bool needs_other_cpu(struct task_struct *p, int cpu)
{
- if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed)))
- return true;
- return false;
+ if (cpumask_test_cpu(cpu, &p->cpus_allowed))
+ return false;
+ return true;
}
+
#define cpu_online_map (*(cpumask_t *)cpu_online_mask)
static void try_preempt(struct task_struct *p, struct rq *this_rq)
@@ -1548,7 +1594,7 @@
cpumask_and(&tmp, &cpu_online_map, &p->cpus_allowed);
for (i = 0; i < num_possible_cpus(); i++) {
- struct rq *rq = this_rq->rq_order[i];
+ struct rq *rq = this_rq->cpu_order[i];
if (!cpumask_test_cpu(rq->cpu, &tmp))
continue;
@@ -1656,7 +1702,7 @@
static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq->lock);
#ifdef CONFIG_SMP
if (p->sched_contributes_to_load)
@@ -1778,13 +1824,14 @@
cpumask_t valid_mask;
if (p->flags & PF_KTHREAD)
- cpumask_and(&valid_mask, &p->cpus_allowed, cpu_online_mask);
+ cpumask_and(&valid_mask, &p->cpus_allowed, cpu_all_mask);
else
cpumask_and(&valid_mask, &p->cpus_allowed, cpu_active_mask);
if (unlikely(!cpumask_weight(&valid_mask))) {
- /* Hotplug boot threads do this before the CPU is up */
- printk(KERN_INFO "SCHED: No cpumask for %s/%d weight %d\n", p->comm, p->pid, cpumask_weight(&p->cpus_allowed));
+ /* We shouldn't be hitting this any more */
+ printk(KERN_WARNING "SCHED: No cpumask for %s/%d weight %d\n", p->comm,
+ p->pid, cpumask_weight(&p->cpus_allowed));
return cpumask_any(&p->cpus_allowed);
}
return cpumask_any(&valid_mask);
@@ -1812,7 +1859,7 @@
}
for (i = 0; i < num_possible_cpus(); i++) {
- struct rq *other_rq = task_rq(p)->rq_order[i];
+ struct rq *other_rq = task_rq(p)->cpu_order[i];
int entries;
if (!other_rq->online)
@@ -2003,7 +2050,7 @@
WARN_ON_ONCE(p == current))
return;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq->lock);
if (!raw_spin_trylock(&p->pi_lock)) {
/*
@@ -2524,7 +2571,7 @@
* schedule()
* preempt_disable(); // 1
* __schedule()
- * raw_spin_lock_irq(&rq->lock) // 2
+ * raw_spin_lock_irq(rq->lock) // 2
*
* Also, see FORK_PREEMPT_COUNT.
*/
@@ -2638,7 +2685,7 @@
* of the scheduler it's an obvious special-case), so we
* do an early lockdep release here:
*/
- spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+ spin_release(rq->lock.dep_map, 1, _THIS_IP_);
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev);
@@ -3383,11 +3430,12 @@
int i, best_entries = 0;
u64 best_key = ~0ULL;
- for (i = 0; i < num_possible_cpus(); i++) {
+ for (i = 0; i < total_runqueues; i++) {
struct rq *other_rq = rq_order(rq, i);
- int entries = other_rq->sl->entries;
skiplist_node *next;
+ int entries;
+ entries = other_rq->sl->entries;
/*
* Check for queued entres lockless first. The local runqueue
* is locked so entries will always be accurate.
@@ -3421,13 +3469,13 @@
}
}
- next = &other_rq->node;
+ next = other_rq->node;
/*
* In interactive mode we check beyond the best entry on other
* runqueues if we can't get the best for smt or affinity
* reasons.
*/
- while ((next = next->next[0]) != &other_rq->node) {
+ while ((next = next->next[0]) != other_rq->node) {
struct task_struct *p;
u64 key = next->key;
@@ -3442,13 +3490,13 @@
continue;
}
+ if (sched_other_cpu(p, cpu)) {
+ if (sched_interactive || !i)
+ continue;
+ break;
+ }
/* Make sure affinity is ok */
if (i) {
- if (needs_other_cpu(p, cpu)) {
- if (sched_interactive)
- continue;
- break;
- }
/* From this point on p is the best so far */
if (locked)
unlock_rq(locked);
@@ -3486,7 +3534,7 @@
if (unlikely(!rq->sl->entries))
return idle;
- edt = rq->node.next[0]->value;
+ edt = rq->node->next[0]->value;
take_task(rq, cpu, edt);
return edt;
}
@@ -3750,12 +3798,15 @@
/*
* Don't reschedule an idle task or deactivated tasks
*/
- if (prev != idle && !deactivate)
+ if (prev == idle)
+ rq->nr_running++;
+ else if (!deactivate)
resched_suitable_idle(prev);
- if (next != idle)
- check_siblings(rq);
- else
+ if (unlikely(next == idle)) {
+ rq->nr_running--;
wake_siblings(rq);
+ } else
+ check_siblings(rq);
rq->nr_switches++;
rq->curr = next;
/*
@@ -5087,6 +5138,7 @@
preempt_schedule_common();
return 1;
}
+ rcu_all_qs();
return 0;
}
EXPORT_SYMBOL(_cond_resched);
@@ -5326,22 +5378,11 @@
return ret;
}
-/**
- * sys_sched_rr_get_interval - return the default timeslice of a process.
- * @pid: pid of the process.
- * @interval: userspace pointer to the timeslice value.
- *
- *
- * Return: On success, 0 and the timeslice is in @interval. Otherwise,
- * an error code.
- */
-SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
- struct timespec __user *, interval)
+static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
{
struct task_struct *p;
unsigned int time_slice;
unsigned long flags;
- struct timespec t;
struct rq *rq;
int retval;
@@ -5363,51 +5404,47 @@
task_rq_unlock(rq, p, &flags);
rcu_read_unlock();
- t = ns_to_timespec(time_slice);
- retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
- return retval;
+ *t = ns_to_timespec64(time_slice);
+ return 0;
out_unlock:
rcu_read_unlock();
return retval;
}
+/**
+ * sys_sched_rr_get_interval - return the default timeslice of a process.
+ * @pid: pid of the process.
+ * @interval: userspace pointer to the timeslice value.
+ *
+ * this syscall writes the default timeslice value of a given process
+ * into the user-space timespec buffer. A value of '0' means infinity.
+ *
+ * Return: On success, 0 and the timeslice is in @interval. Otherwise,
+ * an error code.
+ */
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
+ struct timespec __user *, interval)
+{
+ struct timespec64 t;
+ int retval = sched_rr_get_interval(pid, &t);
+
+ if (retval == 0)
+ retval = put_timespec64(&t, interval);
+
+ return retval;
+}
+
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
compat_pid_t, pid,
struct compat_timespec __user *, interval)
{
- struct task_struct *p;
- unsigned int time_slice;
- unsigned long flags;
- struct timespec t;
- struct rq *rq;
- int retval;
-
- if (pid < 0)
- return -EINVAL;
-
- retval = -ESRCH;
- rcu_read_lock();
- p = find_process_by_pid(pid);
- if (!p)
- goto out_unlock;
+ struct timespec64 t;
+ int retval = sched_rr_get_interval(pid, &t);
- retval = security_task_getscheduler(p);
- if (retval)
- goto out_unlock;
-
- rq = task_rq_lock(p, &flags);
- time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p));
- task_rq_unlock(rq, p, &flags);
-
- rcu_read_unlock();
- t = ns_to_timespec(time_slice);
- retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
- return retval;
-
-out_unlock:
- rcu_read_unlock();
+ if (retval == 0)
+ retval = compat_put_timespec64(&t, interval);
return retval;
}
#endif
@@ -5440,6 +5477,7 @@
show_stack(p, NULL);
put_task_stack(p);
}
+EXPORT_SYMBOL_GPL(sched_show_task);
static inline bool
state_filter_match(unsigned long state_filter, struct task_struct *p)
@@ -5522,7 +5560,7 @@
* Because __kthread_bind() calls this on blocked tasks without
* holding rq->lock.
*/
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq->lock);
}
}
@@ -5558,7 +5596,7 @@
unsigned long flags;
raw_spin_lock_irqsave(&idle->pi_lock, flags);
- raw_spin_lock(&rq->lock);
+ raw_spin_lock(rq->lock);
idle->last_ran = rq->niffies;
time_slice_expired(idle, rq);
idle->state = TASK_RUNNING;
@@ -5588,7 +5626,7 @@
rq->curr = rq->idle = idle;
idle->on_rq = TASK_ON_RQ_QUEUED;
- raw_spin_unlock(&rq->lock);
+ raw_spin_unlock(rq->lock);
raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
/* Set the preempt count _outside_ the spinlocks! */
@@ -5662,7 +5700,7 @@
int i, cpu = smp_processor_id();
struct sched_domain *sd;
- if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
+ if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
return cpu;
rcu_read_lock();
@@ -5671,15 +5709,16 @@
if (cpu == i)
continue;
- if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
+ if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
+ cpu = i;
cpu = i;
goto unlock;
}
}
}
- if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
- cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
+ if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
+ cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
unlock:
rcu_read_unlock();
return cpu;
@@ -6389,16 +6428,12 @@
void __init sched_init_smp(void)
{
+ struct rq *rq, *other_rq, *leader;
struct sched_domain *sd;
- int cpu, other_cpu;
+ int cpu, other_cpu, i;
#ifdef CONFIG_SCHED_SMT
bool smt_threads = false;
#endif
- cpumask_var_t non_isolated_cpus;
- struct rq *rq;
-
- alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
-
sched_init_numa();
/*
@@ -6408,15 +6443,11 @@
*/
mutex_lock(&sched_domains_mutex);
sched_init_domains(cpu_active_mask);
- cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
- if (cpumask_empty(non_isolated_cpus))
- cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
mutex_unlock(&sched_domains_mutex);
/* Move init over to a non-isolated CPU */
- if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
BUG();
- free_cpumask_var(non_isolated_cpus);
mutex_lock(&sched_domains_mutex);
local_irq_disable();
@@ -6437,8 +6468,18 @@
for_each_domain(cpu, sd) {
if (sd->level > SD_LV_MC)
continue;
+ leader = NULL;
/* Set locality to local node if not already found lower */
for_each_cpu(other_cpu, sched_domain_span(sd)) {
+ if (rqshare == RQSHARE_SMP) {
+ other_rq = cpu_rq(other_cpu);
+
+ /* Set the smp_leader to the first CPU */
+ if (!leader)
+ leader = rq;
+ other_rq->smp_leader = leader;
+ }
+
if (rq->cpu_locality[other_cpu] > 3)
rq->cpu_locality[other_cpu] = 3;
}
@@ -6449,38 +6490,48 @@
* siblings of its own allowing mixed topologies.
*/
#ifdef CONFIG_SCHED_MC
- for_each_cpu(other_cpu, core_cpumask(cpu)) {
- if (rq->cpu_locality[other_cpu] > 2)
- rq->cpu_locality[other_cpu] = 2;
- }
+ leader = NULL;
if (cpumask_weight(core_cpumask(cpu)) > 1) {
cpumask_copy(&rq->core_mask, core_cpumask(cpu));
cpumask_clear_cpu(cpu, &rq->core_mask);
+ for_each_cpu(other_cpu, core_cpumask(cpu)) {
+ if (rqshare == RQSHARE_MC) {
+ other_rq = cpu_rq(other_cpu);
+
+ /* Set the mc_leader to the first CPU */
+ if (!leader)
+ leader = rq;
+ other_rq->mc_leader = leader;
+ }
+ if (rq->cpu_locality[other_cpu] > 2)
+ rq->cpu_locality[other_cpu] = 2;
+ }
rq->cache_idle = cache_cpu_idle;
}
#endif
#ifdef CONFIG_SCHED_SMT
+ leader = NULL;
if (cpumask_weight(thread_cpumask(cpu)) > 1) {
cpumask_copy(&rq->thread_mask, thread_cpumask(cpu));
cpumask_clear_cpu(cpu, &rq->thread_mask);
- for_each_cpu(other_cpu, thread_cpumask(cpu))
- rq->cpu_locality[other_cpu] = 1;
+ for_each_cpu(other_cpu, thread_cpumask(cpu)) {
+ if (rqshare == RQSHARE_SMT) {
+ other_rq = cpu_rq(other_cpu);
+
+ /* Set the smt_leader to the first CPU */
+ if (!leader)
+ leader = rq;
+ other_rq->smt_leader = leader;
+ }
+ if (rq->cpu_locality[other_cpu] > 1)
+ rq->cpu_locality[other_cpu] = 1;
+ }
rq->siblings_idle = siblings_cpu_idle;
smt_threads = true;
}
#endif
}
- for_each_possible_cpu(cpu) {
- int total_cpus = 1, locality;
- rq = cpu_rq(cpu);
- for (locality = 1; locality <= 4; locality++) {
- for_each_possible_cpu(other_cpu) {
- if (rq->cpu_locality[other_cpu] == locality)
- rq->rq_order[total_cpus++] = cpu_rq(other_cpu);
- }
- }
- }
#ifdef CONFIG_SMT_NICE
if (smt_threads) {
check_siblings = &check_smt_siblings;
@@ -6502,6 +6553,149 @@
}
}
+ for_each_online_cpu(cpu) {
+ rq = cpu_rq(cpu);
+ leader = rq->smp_leader;
+
+ rq_lock(rq);
+ if (leader && rq != leader) {
+ printk(KERN_INFO "Sharing SMP runqueue from CPU %d to CPU %d\n",
+ leader->cpu, rq->cpu);
+ kfree(rq->node);
+ kfree(rq->sl);
+ kfree(rq->lock);
+ rq->node = leader->node;
+ rq->sl = leader->sl;
+ rq->lock = leader->lock;
+ barrier();
+ /* To make up for not unlocking the freed runlock */
+ preempt_enable();
+ } else
+ rq_unlock(rq);
+ }
+
+#ifdef CONFIG_SCHED_MC
+ for_each_online_cpu(cpu) {
+ rq = cpu_rq(cpu);
+ leader = rq->mc_leader;
+
+ rq_lock(rq);
+ if (leader && rq != leader) {
+ printk(KERN_INFO "Sharing MC runqueue from CPU %d to CPU %d\n",
+ leader->cpu, rq->cpu);
+ kfree(rq->node);
+ kfree(rq->sl);
+ kfree(rq->lock);
+ rq->node = leader->node;
+ rq->sl = leader->sl;
+ rq->lock = leader->lock;
+ barrier();
+ /* To make up for not unlocking the freed runlock */
+ preempt_enable();