partial cherrypick of @neonsecret's optimized attention CompVis#177
I didn't implement the most consequential part (splitting the softmax in two) because M1 Mac is not so VRAM-constrained.
but I implemented the reference-freeing, and also freed x earlier.
Birch-san committed Sep 11, 2022
1 parent 37fdde1 commit dab78e9
Showing 1 changed file with 4 additions and 0 deletions.
ldm/modules/attention.py (4 additions, 0 deletions)
@@ -172,18 +172,22 @@ def forward(self, x, context=None, mask=None):

         q = self.to_q(x)
         context = default(context, x)
+        del x
         k = self.to_k(context)
         v = self.to_v(context)
+        del context

         q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))

         sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
+        del q, k

         if exists(mask):
             mask = rearrange(mask, 'b ... -> b (...)')
             max_neg_value = -torch.finfo(sim.dtype).max
             mask = repeat(mask, 'b j -> (b h) () j', h=h)
             sim.masked_fill_(~mask, max_neg_value)
+            del mask

         # attention, what we cannot get enough of
         attn = sim.softmax(dim=-1)

7 comments on commit dab78e9

@neonsecret

it's a lot more than that..

@Birch-san commented on dab78e9 Sep 11, 2022

thanks for reviewing!

I think the only part I missed was this?
[screenshot]

ah, certainly I neglected to free the reference to sim.
ah, and you're re-using sim's storage (to hold the softmax() result and then the einsum() result)?

okay, I'll certainly add those.

but the sim[4:] split… what does this achieve? it reduces concurrency. so if I have enough VRAM (I have 64GB), presumably it's faster to avoid doing this?
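
to make sure I understand the intent: here's a rough sketch (my own paraphrase, not code from either repo) of chunking the attention over the batch·heads dimension, with an illustrative chunk_size:

```python
import torch
from torch import einsum

def chunked_attention(q, k, v, scale, chunk_size=4):
    # q, k, v: (b*h, n, d), as produced by the rearrange above.
    # Only a chunk_size-sized slice of the attention matrix is
    # materialized at a time, at the cost of running the einsums
    # sequentially instead of as one big launch.
    out = torch.empty_like(q)
    for i in range(0, q.shape[0], chunk_size):
        sim = einsum('b i d, b j d -> b i j', q[i:i+chunk_size], k[i:i+chunk_size]) * scale
        # rebinding sim drops the only reference to the pre-softmax logits
        sim = sim.softmax(dim=-1)
        out[i:i+chunk_size] = einsum('b i j, b j d -> b i d', sim, v[i:i+chunk_size])
        del sim
    return out
```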

@neonsecret

take a look at my current fork https://github.com/neonsecret/stable-diffusion
and its changes. it's a whole lot more than that..

@Birch-san

thanks! I'll have a dig. it's a 91-file diff though, and in most cases it adds whole new files, so it's hard to find the important bits…
CompVis/stable-diffusion@main...neonsecret:stable-diffusion:main

any key files you'd recommend looking at?

@neonsecret

yeah, that's why you probably shouldn't try to merge it; it differs too much now

@Birch-san

okay, I've reviewed the attention.py in your branch. certainly there's more there than was in CompVis#177.

however, to my understanding, the attention.py changes only reduce memory usage, and they come at the expense of inference speed?
I have the opposite problem. M1 GPUs are slow at inference, but have loads of VRAM.
we also cannot use torch.cuda.memory_stats(device), torch.cuda.mem_get_info(torch.cuda.current_device()) or torch.cuda.empty_cache() (because we do not have CUDA) and cannot use FP16.
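
for illustration, this is the kind of branching we end up with (a rough sketch assuming a recent stock PyTorch; report_memory is just an illustrative name, not something in the repo):

```python
import torch

def report_memory(device: torch.device) -> None:
    # The CUDA allocator exposes stats and a cache we can flush;
    # the MPS backend has no equivalent, so we can only skip it.
    if device.type == 'cuda':
        free, total = torch.cuda.mem_get_info(device)
        print(f'free: {free / 2**30:.2f} GiB / total: {total / 2**30:.2f} GiB')
        torch.cuda.empty_cache()
    else:
        print(f'no allocator stats available on {device.type}')
```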

is there anything you'd recommend for improving inference speed?

@Birch-san

wait, are you load-balancing between CUDA and CPU? is that for speed or for memory?
