Hotfix(MInference): fix the configs in pip (#14)
Co-authored-by: Yucheng Li <liyucheng09@gmail.com>
Co-authored-by: Chengruidong Zhang <chengzhang@microsoft.com>
3 people authored Jul 5, 2024
1 parent 00666fb commit 1c2bf70
Showing 3 changed files with 9 additions and 2 deletions.
MANIFEST.in (2 additions, 0 deletions)

@@ -1,2 +1,4 @@
 recursive-include csrc *.cu
 recursive-include csrc *.cpp
+
+recursive-include minference *.json
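For context: MANIFEST.in controls which non-Python files are bundled into the source distribution, so without the new `recursive-include minference *.json` rule the package's JSON config files would be missing after `pip install`. Below is a minimal, hypothetical sketch of how a package might read such a bundled config at runtime; the `configs` subdirectory, file name handling, and function are illustrative and not MInference's actual API.

```python
# Hypothetical loader for a JSON config that ships inside the installed
# package. If the file is not included in the distribution (the problem this
# commit fixes), open() here fails with FileNotFoundError after pip install.
import json
import os


def load_bundled_config(name: str) -> dict:
    # Resolve the config path relative to the installed package directory.
    package_dir = os.path.dirname(os.path.abspath(__file__))
    config_path = os.path.join(package_dir, "configs", name)  # layout is illustrative
    with open(config_path, "r") as f:
        return json.load(f)
```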
README.md (1 addition, 0 deletions)

@@ -80,6 +80,7 @@ pipe(prompt, max_length=10)
 ```
 
 for vLLM,
+> For now, please use vllm==0.4.1
 ```diff
 from vllm import LLM, SamplingParams
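The pin exists because MInference's vLLM integration is only validated against that release. A hypothetical pre-flight check in the spirit of the README note (not part of this commit) could look like:

```python
# Hypothetical guard mirroring the README note: warn when the installed vLLM
# is not the pinned 0.4.1 release that the integration currently expects.
import warnings

import vllm

if vllm.__version__ != "0.4.1":
    warnings.warn(
        f"Detected vllm=={vllm.__version__}; MInference's vLLM path currently "
        "expects vllm==0.4.1."
    )
```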
minference/modules/minference_forward.py (6 additions, 2 deletions)

@@ -4,13 +4,17 @@
 import inspect
 import json
 import os
+import warnings
 from importlib import import_module
 
 from transformers.models.llama.modeling_llama import *
 from transformers.utils.import_utils import _is_package_available
 
 if _is_package_available("vllm"):
-    from vllm.attention.backends.flash_attn import *
+    try:
+        from vllm.attention.backends.flash_attn import *
+    except:
+        warnings.warn("Only support 'vllm==0.4.1'. Please update your vllm version.")
 
 from ..ops.block_sparse_flash_attention import block_sparse_attention
 from ..ops.pit_sparse_flash_attention_v2 import vertical_slash_sparse_attention

@@ -768,7 +772,7 @@ def forward(
     key: torch.Tensor,
     value: torch.Tensor,
     kv_cache: torch.Tensor,
-    attn_metadata: AttentionMetadata[FlashAttentionMetadata],
+    attn_metadata,
     kv_scale: float,
     layer_idx: int,
 ) -> torch.Tensor:
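The second hunk drops the parameterized annotation presumably because, with the wildcard import now guarded, names like FlashAttentionMetadata may be undefined on unsupported vLLM versions, and Python evaluates annotations when the def statement runs, so the module could fail to import with a NameError. A hypothetical alternative that keeps the type for static checkers only (not what this commit does) is postponed annotation evaluation:

```python
# Hypothetical alternative (not the approach taken in this commit): defer
# annotation evaluation so a missing vLLM type cannot break module import.
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Imported only by static type checkers, never at runtime.
    from vllm.attention.backends.flash_attn import FlashAttentionMetadata


def forward(attn_metadata: FlashAttentionMetadata) -> None:
    # With postponed evaluation, the annotation is stored as a string and is
    # never looked up at runtime, so this def succeeds even without vllm.
    ...
```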
