From c86ae1c4ad0183cf0987ec2da70b309aea01617b Mon Sep 17 00:00:00 2001 From: Bernhard Urban-Forster Date: Thu, 31 Oct 2019 22:48:07 +0100 Subject: [PATCH] [mono] limit DegreeOfParallelism to 16 We started to see the `System.Core-xunit` step on CI hit the timeout of 15 minutes with Linux/ARM64. That was weird, because the step used to complete in around two minutes. With my local device (jetson board) I wasn't able to reproduce it either; it took around 100s there. We then realized it's specific to the new `taishan` CI machines, which are equipped with 64 cores. Hardcoding `mono_cpu_count` to return 16 restored the performance; however, that isn't a viable fix. Limiting `DefaultDegreeOfParallelism` to 16 fixes it, which is less extreme than limiting `mono_cpu_count ()`, though still not ideal. It seems to boil down to the fact that our non-netcore threadpool implementation doesn't handle a large number of cores well. `repro.cs`, extracted from here https://github.com/dotnet/corefx/blob/a9b91e205a8794327a028cb4b29953127f0f194c/src/System.Linq.Parallel/tests/QueryOperators/ConcatTests.cs#L145-L154 ```csharp using System; using System.Linq; using System.Collections.Generic; using System.Threading; public class Repro { public static void Main (string []args) { const int ElementCount = 2048; ParallelQuery<int> leftQuery = ParallelEnumerable.Range(0, ElementCount / 4).Union(ParallelEnumerable.Range(ElementCount / 4, ElementCount / 4)); ParallelQuery<int> rightQuery = ParallelEnumerable.Range(2 * ElementCount / 4, ElementCount / 4).Union(ParallelEnumerable.Range(3 * ElementCount / 4, ElementCount / 4)); var results = new HashSet<int>(leftQuery.Concat(rightQuery)); Console.WriteLine ("results.Count=" + results.Count + ", ElementCount=" + ElementCount); } } ``` Before fix: ```console $ time ./mono/mini/mono-sgen repro.exe results.Count=2048, ElementCount=2048 real 0m5.846s user 0m0.344s sys 0m1.929s $ make -C mcs/class/System.Core run-xunit-test [...] 
=== TEST EXECUTION SUMMARY === net_4_x_System.Core_xunit-test Total: 48774, Errors: 0, Failed: 0, Skipped: 6, Time: 536.005s ``` With this fix: ```console $ time ./mono/mini/mono-sgen repro.exe results.Count=2048, ElementCount=2048 real 0m1.247s user 0m0.206s sys 0m0.225s $ make -C mcs/class/System.Core run-xunit-test [...] === TEST EXECUTION SUMMARY === net_4_x_System.Core_xunit-test Total: 48774, Errors: 0, Failed: 0, Skipped: 6, Time: 131.143s ``` --- .../src/System/Linq/Parallel/Scheduling/Scheduling.cs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/System.Linq.Parallel/src/System/Linq/Parallel/Scheduling/Scheduling.cs b/src/System.Linq.Parallel/src/System/Linq/Parallel/Scheduling/Scheduling.cs index 945994e06ebd..4c9a89c1da05 100644 --- a/src/System.Linq.Parallel/src/System/Linq/Parallel/Scheduling/Scheduling.cs +++ b/src/System.Linq.Parallel/src/System/Linq/Parallel/Scheduling/Scheduling.cs @@ -47,8 +47,13 @@ internal static class Scheduling // The number of milliseconds before we assume a producer has been zombied. internal const int ZOMBIED_PRODUCER_TIMEOUT = Timeout.Infinite; +#if MONO + /* limit to degree of 16 to avoid too much contention */ + internal const int MAX_SUPPORTED_DOP = 16; +#else // The largest number of partitions that PLINQ supports. internal const int MAX_SUPPORTED_DOP = 512; +#endif //-----------------------------------------------------------------------------------