
JIT: loop interchange optimization #4358

Closed
ghost opened this issue Jul 8, 2015 · 22 comments
Labels
area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI enhancement Product code improvement that does NOT require public API changes/additions optimization tenet-performance Performance related issue


ghost commented Jul 8, 2015

Source: http://stackoverflow.com/a/11303693/863980

From the link above it is evident that the following two methods are logically equivalent: loop interchange followed by loop-invariant code motion turns the first into the second:

public static void UnHoisted(ref int[] data, int arraySize)
{
    long sum = 0;

    for (int i = 0; i < 100000; ++i)
    {
        for (int j = 0; j < arraySize; ++j)
        {
            if (data[j] >= 128)
                sum += data[j];
        }
    }
}

and

public static void Hoisted(ref int[] data, int arraySize)
{
    long sum = 0;

    for (int j = 0; j < arraySize; ++j)
    {
        if (data[j] >= 128)
        {
            sum += data[j] * 100000;
        }
    }
}
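For illustration, here is a sketch of the intermediate interchange step (hypothetical code, not from the original post; the method name `Interchanged` and the `long` return, added so the result is observable, are my own):

```csharp
// Sketch of the loop-interchange step, assuming the i/j loops can legally be
// swapped (no cross-iteration dependences on data). After the swap, the body
// of the inner i-loop no longer depends on i, so the 100000 additions can be
// reduced to the single multiply seen in Hoisted().
public static long Interchanged(int[] data, int arraySize)
{
    long sum = 0;

    for (int j = 0; j < arraySize; ++j)       // formerly the inner loop
    {
        for (int i = 0; i < 100000; ++i)      // formerly the outer loop
        {
            if (data[j] >= 128)               // invariant with respect to i
                sum += data[j];               // candidate for reduction to
        }                                     // data[j] * 100000
    }
    return sum;
}
```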

The driver (Main) method looks like:

public static void Main (string[] args)
{
    const int arraySize = 32768;
    int[] data = new int[arraySize];
    Random random = new Random();

    for (int c = 0; c < arraySize; ++c)
        data[c] = random.Next() % 256;

    UnHoisted(ref data, arraySize);
    // Hoisted(ref data, arraySize);
}

Diffing the produced disasm of UnHoisted and Hoisted methods results in:

diff --git a/c:/temp/UnHoisted.txt b/c:/temp/Hoisted.txt
index 9f25a6b..e4cbf57 100644
--- a/c:/temp/UnHoisted.txt
+++ b/c:/temp/Hoisted.txt
@@ -1,79 +1,66 @@
-; Assembly listing for method Program:UnHoisted(byref,int)
+; Assembly listing for method Program:Hoisted(byref,int)
 ; Emitting BLENDED_CODE for X64 CPU with AVX
 ; optimized code
 ; rsp based frame
 ; fully interruptible
 ; Final local variable assignments
 ;
-;  V00 arg0         [V00,T04] (  4,  18  )   byref  ->  rcx        
-;  V01 arg1         [V01,T05] (  3,  18  )     int  ->  rdx        
-;  V02 loc0         [V02,T06] (  3,  17  )    long  ->  rax        
-;  V03 loc1         [V03,T07] (  4,  13  )     int  ->   r8        
-;  V04 loc2         [V04,T00] (  8,  66  )     int  ->  r10        
-;  V05 loc3         [V05,T01] (  6,  64  )    bool  ->   r9        
-;  V06 tmp0         [V06,T02] (  3,  48  )     ref  ->   r9        
-;  V07 tmp1         [V07,T03] (  3,  48  )     ref  ->   r9        
-;  V08 OutArgs      [V08    ] (  1,   1  )  lclBlk (32) [rsp+0x00]  
+;  V00 arg0         [V00,T05] (  4,   6  )   byref  ->  rcx        
+;  V01 arg1         [V01,T04] (  3,  10  )     int  ->  rdx        
+;  V02 loc0         [V02,T06] (  3,   5  )    long  ->  rax        
+;  V03 loc1         [V03,T00] (  8,  21  )     int  ->   r8        
+;  V04 loc2         [V04,T01] (  4,  20  )    bool  ->   r9        
+;  V05 tmp0         [V05,T02] (  3,  12  )     ref  ->   r9        
+;  V06 tmp1         [V06,T03] (  3,  12  )     ref  ->   r9        
+;  V07 OutArgs      [V07    ] (  1,   1  )  lclBlk (32) [rsp+0x00]  
 ;
 ; Lcl frame size = 40

-G_M4700_IG01:
+G_M9639_IG01:
        4883EC28             sub      rsp, 40

-G_M4700_IG02:
+G_M9639_IG02:
        33C0                 xor      rax, rax
        4533C0               xor      r8d, r8d
-       EB5D                 jmp      SHORT G_M4700_IG07
+       EB49                 jmp      SHORT G_M9639_IG05

-G_M4700_IG03:
-       4533D2               xor      r10d, r10d
-       EB45                 jmp      SHORT G_M4700_IG06
-
-G_M4700_IG04:
+G_M9639_IG03:
        4C8B09               mov      r9, gword ptr [rcx]
-       458B5908             mov      r11d, dword ptr [r9+8]
-       453BD3               cmp      r10d, r11d
-       7365                 jae      SHORT G_M4700_IG09
-       4D63DA               movsxd   r11d, r10d
-       43817C991080000000   cmp      dword ptr [r9+4*r11+16], 128
+       458B5108             mov      r10d, dword ptr [r9+8]
+       453BC2               cmp      r8d, r10d
+       7352                 jae      SHORT G_M9639_IG07
+       4D63D0               movsxd   r10d, r8d
+       43817C911080000000   cmp      dword ptr [r9+4*r10+16], 128
        410F9CC1             setl     r9b
        450FB6C9             movzx    r9, r9b
        4585C9               test     r9d, r9d
-       751D                 jne      SHORT G_M4700_IG05
+       7521                 jne      SHORT G_M9639_IG04
        4C8B09               mov      r9, gword ptr [rcx]
-       458B5908             mov      r11d, dword ptr [r9+8]
-       453BD3               cmp      r10d, r11d
-       7340                 jae      SHORT G_M4700_IG09
-       4D63DA               movsxd   r11d, r10d
-       478B4C9910           mov      r9d, dword ptr [r9+4*r11+16]
+       458B5108             mov      r10d, dword ptr [r9+8]
+       453BC2               cmp      r8d, r10d
+       732D                 jae      SHORT G_M9639_IG07
+       4D63D0               movsxd   r10d, r8d
+       47694C9110A0860100   imul     r9d, dword ptr [r9+4*r10+16], 0x186A0
        4D63C9               movsxd   r9d, r9d
        4903C1               add      rax, r9

-G_M4700_IG05:
-       41FFC2               inc      r10d
-
-G_M4700_IG06:
-       443BD2               cmp      r10d, edx
-       410F9CC1             setl     r9b
-       450FB6C9             movzx    r9, r9b
-       4585C9               test     r9d, r9d
-       75AE                 jne      SHORT G_M4700_IG04
+G_M9639_IG04:
        41FFC0               inc      r8d

-G_M4700_IG07:
-       4181F8A0860100       cmp      r8d, 0x186A0
+G_M9639_IG05:
+       443BC2               cmp      r8d, edx
        410F9CC1             setl     r9b
        450FB6C9             movzx    r9, r9b
        4585C9               test     r9d, r9d
-       7592                 jne      SHORT G_M4700_IG03
+       75AA                 jne      SHORT G_M9639_IG03

-G_M4700_IG08:
+G_M9639_IG06:
        4883C428             add      rsp, 40
        C3                   ret      

-G_M4700_IG09:
-       E8D152265F           call     CORINFO_HELP_RNGCHKFAIL
+G_M9639_IG07:
+       E8E952245F           call     CORINFO_HELP_RNGCHKFAIL
        CC                   int3     

-; Total bytes of code 132, prolog size 4 for method Program:UnHoisted(byref,int)
+; Total bytes of code 108, prolog size 4 for method Program:Hoisted(byref,int)
 ; ============================================================

The execution time of UnHoisted() is much higher than that of Hoisted().

Do we have a starting point for this kind of optimization, such as some basic heuristics already in place that need to be enhanced, or would it require an implementation from the ground up?

Cc @mikedn, @omariom, @cmckinsey

category:cq
theme:loop-opt
skill-level:expert
cost:extra-large


mikedn commented Jul 8, 2015

Have you seen a compiler which implements the optimization described in that StackOverflow post? 😄


ghost commented Jul 8, 2015

@mikedn, ouch! like the Japanese spirit says, "if nobody has done it, I must do it" ㊗️

[Unwarranted] I think that while Intel's compiler is good at loop-interchange optimization, it is also leading in the loop-invariant hoisting (LIH) arena, but I am not sure which areas are fuzzy and most challenging (research-wise).


mikedn commented Jul 8, 2015

Yes, Intel's compiler does this optimization; all others vectorize the inner loop and don't do anything else interesting. The interesting thing about the Intel compiler is that it does it only with /O2, not with /O1. That may imply that the compiler wasn't specifically performing this optimization, but that the result was a fortunate byproduct of other optimizations. Loop interchange requires data dependence analysis, and the compiler might have done that for autovectorization purposes.

@OtherCrashOverride

I don't think this is a good fit for CoreCLR. The runtime's job is to take an intermediate representation (IR) and make it target-specific as quickly as possible: JIT = Just-In-Time (compilation). Additionally, the IR has no concept of loops or other structured programming constructs; it only understands branches (goto). Combine this with the runtime's goal of supporting multiple languages, and it becomes very difficult to determine what is a candidate for further optimization and what should not be touched due to side effects that may occur. In the example given above, it is entirely possible that another thread is updating the array (from the JIT's point of view), so the order of operations is important.

Given the above, this type of optimization is better suited towards Roslyn, etc. Additionally, compiler optimizations should not be viewed as a substitute for human programming skill.

@TPSResearch

I completely disagree with OtherCrashOverride. In the 90's, developers were instructed not to try to outsmart the compiler: just write readable code and let the optimizer take care of the details of making it fast. Now, we appear to be back to hand-optimizing code because the JIT'er isn't smart enough. Also, the argument that another thread might modify the array is flawed. Optimizing compilers simply don't work that way... if they did, they could never do "loop fusion", for example.

Finally, the idea of "JIT = Just In Time (compilation)" is, IMHO, the biggest problem. I'd bet that the majority of code in use today runs on servers and does not care how long it takes to compile. Result: the mandate is to have a "sub-optimizing" compiler.

@masonwheeler

Agreeing with @TPSResearch. We've always known that JITs are kind of a dumb idea, and this is just another reiteration of that principle. Yes, there's a place for them, particularly in debugging and testing when the goal is to iterate as quickly as possible, but for deployment of a working product, a good AOT compiler will beat a good JIT hands down, every time.

An optimization like this is probably not suitable for the JIT, but it should be shunted over to LLILC, which has the development of an AOT compiler as its long-term goal.


benpye commented Jul 9, 2015

If you are looking to get this in LLILC then wouldn't it really be LLVM where the optimization should be performed?

@OtherCrashOverride

In the 90's, developers were instructed to not try to outsmart the compiler, just write readable code and let the optimizer take care of the details of making it fast. Now, we appear to be back to hand-optimizing code because the JIT'er isn't smart enough.

There will always be a difference in code written for clarity and maintenance and that written for performance.

The main thing that has changed since the 90's is first-class Ahead-Of-Time (AOT) compilation support, since Windows on ARM does not allow execution of JIT-generated code (W^X). AOT is where we can spend as much time as needed performing deep optimizations in an offline process. It's important that the JIT do its job as fast as possible: a server environment may not care, but use cases outside of that will. CoreCLR serves many domains outside of servers and languages outside of C#.

I don't think anyone is under the impression that this is a trivial optimization. Should those who write performant code be penalized by a slower JIT that will not find anything to optimize? With AOT the penalty is an O(1) cost; with JIT it's an O(n) cost.

In the end, what is being discussed is outside the scope of the JIT: it is a modification of IL, not compilation of it. Someone should put forth a pull request for the optimization feature so that it can be tested, both to answer how much of a degradation it will cause in JIT time and to show the real-world performance increases that can be expected. It's entirely possible that it will have no statistically meaningful benefit for the majority of existing real-world code.


Eyas commented Jul 11, 2015

Right. The takeaways here, which apply to many JIT optimization issues, are:

  1. Keep in mind this is a JIT compiler. Many complex optimizations incur a runtime cost that is not worthwhile in many cases. This is actually how RyuJIT came to replace the 64-bit JITter. AOT is coming to C# in the form of .NET Native. When there is an appropriate forum to do so, some of these optimization issues should move from CoreCLR there.
  2. One easy heuristic about when a proposed optimization is certainly out of the scope of a JITter: when very few AOT optimizing compilers actually perform that optimization 😄

@masonwheeler

@Eyas If "AOT is coming to C# in the form of .NET Native," that really doesn't do much for people using other CLR languages, because it doesn't appear that they're open-sourcing the .NET Native toolchain. So that's kind of an unhelpful observation.

@OtherCrashOverride

when very few AOT optimizing compilers actually perform that optimization

I believe this type of optimization is outside the scope of both JIT and AOT. This is something for Roslyn. JIT and AOT deal with IL, which is analogous to dealing with processor assembly code. Compilers typically optimize at a higher level. Optimizing in Roslyn is analogous to optimizing in a C/C++ compiler. It makes more sense because it has a better view and understanding of what is going on with the code. However, I think this too is probably better served by a compiler warning, a Visual Studio 'lightbulb' suggestion, or an offline tool rather than forcing it on everyone. The author of the code should be made aware and also given the option to alter the source code or ignore it. The option to ignore it becomes important when there is a purposeful order of operations due to the algorithm being multi-threaded.


ghost commented Jul 11, 2015

When there is an appropriate forum to do so, some of these optimization issues should move from CoreCLR to there.

Meanwhile, should this be a candidate for Roslyn, in the IL generation done by csc and vbc (where other optimizations take place today and where we can afford to spend time on deep analysis)?

@jaredpar, can you please suggest what, ideally, the plan of attack should be here?


leppie commented Jul 12, 2015

Easy test: can you prove the array is not being mutated?

@masonwheeler

@jasonwilliams200OK @OtherCrashOverride The problem with saying "oh, just do it in the front-end compiler" is that, again, there are a lot more than just one (or even two) CLR languages! "Just do it in the compiler" means we're back to the bad old days when every compiler needs to reinvent every wheel every time. Wasn't getting away from that the entire point of having a common language infrastructure in the first place?

@OtherCrashOverride

It's probably worth mentioning that this is something the LLVM back-end (dotnet/llilc) can provide:
http://llvm.org/docs/Passes.html#licm-loop-invariant-code-motion

Mono has already implemented LLVM backend support. They also retain the classic JIT because an LLVM-based JIT adds considerable time to the JIT process, which is undesirable for some customers.


mikedn commented Jul 12, 2015

Additionally, the IR has no concept of loops or other structured programming techniques. It only understands branches (goto).

That is completely irrelevant. Optimizers perform loop optimizations on IR, and have done so for a long time.

In the example given above, it is entire possible that another thread is updating the array (from a JIT point of view) and so the order of operation is important.

Reads do not constitute side-effects and as such they can be reordered and/or eliminated. The JIT already does this kind of stuff.

Given the above, this type of optimization is better suited towards Roslyn
I believe this type of optimization is outside the scope of both JIT and AOT. This is something for Roslyn.

No, it is not something for Roslyn. Roslyn does very few optimizations on its own and lacks the infrastructure to perform optimizations like the one discussed here. It is certainly in scope of AOT and it might also be in scope of JIT in certain circumstances. At the end of the day, .NET Framework does partial AOT and has done that since day 1. It's called NGEN.

Compilers typically optimize at a higher level. Optimizing in Roslyn is analogous to optimizing in a C/C++ compiler. It makes more sense because it has a better view and understanding of what is going on with the code.

That is incorrect. Compilers actually optimize at a lower level. Optimizing in Roslyn isn't analogous to optimizing in a C/C++ compiler: Roslyn is the equivalent of the frontend, and a C/C++ compiler, just like Roslyn, does little to no optimization in the frontend. The better view is simply not there, because the frontend lacks the infrastructure needed by a lot of modern code optimizations.

Its probably worth mentioning that this is something the LLVM back-end (dotnet/llilc) can provide:
http://llvm.org/docs/Passes.html#licm-loop-invariant-code-motion

RyuJIT already does loop invariant code motion. Neither RyuJIT nor LLVM do loop interchange (well, it's quite possible that LLVM does it but it still doesn't optimize the example code as mentioned in the StackOverflow post).

@OtherCrashOverride

RyuJIT already does loop invariant code motion.

Great! So we can close this issue.


ghost commented Jul 20, 2015

@mikedn, can you point me to the place in the RyuJIT code where this optimization takes place? I will try to figure out what needs to be done to make loop-interchange optimization possible, since part of the solution in the linked answer is loop interchange (the very first step).

To my knowledge, LLVM, Clang and GCC do not provide loop-interchange optimization either. Neither does MSVC. Intel has invested a heavy amount of resources to make it possible in their compiler. Some areas of loop-interchange optimization are still in the research phase, since the heuristics involved are quite prone to backfiring given a small mistake.


mikedn commented Jul 21, 2015

@jasonwilliams200OK For loop invariant code motion check optHoistLoopCode, optHoistLoopNest, optHoistThisLoop, optHoistLoopExprsForBlock, optHoistLoopExprsForTree, optVNIsLoopInvariant in optimizer.cpp. But that has little to do with loop interchange which is a very different and significantly more complex optimization.


choikwa commented Jun 15, 2016

Just wanted to ask something: how is this related to LICM? I see it more as an opportunity to do loop interchange followed by dimensionality reduction of the i-loop.
Edit: I see it a bit more clearly now. Technically, reducing the i-loop means all ops inside it are loop-invariant, so you could reduce it without loop interchange.


mikedn commented Jun 15, 2016

how is this related to LICM?

Once you do loop interchange you might rely on LICM to hoist the if out of the inner loop. Though hoisting an if out of a loop is beyond what LICM usually does, this is more like loop unswitching.
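As a sketch of what loop unswitching means here (hypothetical code, not RyuJIT output; the method `SumIfBig` and its `includeSmall` parameter are illustrative only), the transformation tests a loop-invariant condition once and duplicates the loop into a specialized copy per outcome:

```csharp
// Loop unswitching: the condition `includeSmall` never changes inside the
// loop, so instead of re-testing it every iteration, the loop is split into
// two copies, one for each value of the condition.
static long SumIfBig(int[] data, bool includeSmall)
{
    long sum = 0;

    // Before unswitching:
    // for (int i = 0; i < data.Length; ++i)
    //     if (includeSmall || data[i] >= 128) sum += data[i];

    // After unswitching, the invariant test is outside the loop:
    if (includeSmall)
    {
        for (int i = 0; i < data.Length; ++i)
            sum += data[i];
    }
    else
    {
        for (int i = 0; i < data.Length; ++i)
            if (data[i] >= 128) sum += data[i];
    }
    return sum;
}
```

The same shape applies to the interchanged example above: after interchange, `data[j] >= 128` is invariant in the inner loop and can be unswitched out of it.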

@msftgits msftgits transferred this issue from dotnet/coreclr Jan 30, 2020
@msftgits msftgits added this to the Future milestone Jan 30, 2020
@BruceForstall BruceForstall added the JitUntriaged CLR JIT issues needing additional triage label Oct 28, 2020
@BruceForstall

Given that loop interchange, loop unswitching, and loop reduction are likely out of the scope of the JIT in the near to medium term (even if we might want them, aspirationally), I'm going to close this issue.

@BruceForstall BruceForstall removed the JitUntriaged CLR JIT issues needing additional triage label Oct 29, 2020
@BruceForstall BruceForstall changed the title Feature - loop-invariant code motion (hoisting) optimization JIT: loop interchange optimization Oct 29, 2020
@ghost ghost locked as resolved and limited conversation to collaborators Jan 5, 2021