Fix issues with no code default handlers and tests (deepjavalibrary#660)
siddvenk authored Apr 25, 2023
1 parent 8199a6f commit 6a51493
Showing 2 changed files with 19 additions and 24 deletions.
engines/python/setup/djl_python/huggingface.py: 33 changes (12 additions, 21 deletions)
@@ -16,7 +16,7 @@
 import os
 
 import torch
-from transformers import pipeline, Conversation, AutoModelForCausalLM, AutoTokenizer
+from transformers import pipeline, Conversation, AutoModelForCausalLM, AutoTokenizer, AutoConfig
 
 from djl_python.encode_decode import encode, decode
 from djl_python.inputs import Input
@@ -104,22 +104,13 @@ def initialize(self, properties: dict):
             self.initialized = True
             return
 
-        if task:
-            self.hf_pipeline = self.get_pipeline(
-                task=task,
-                model_id_or_path=model_id_or_path,
-                device=device_id,
-                kwargs=kwargs)
-        elif "config.json" in os.listdir(model_id_or_path):
-            task = self.infer_task_from_model_architecture(
-                f"{model_id_or_path}/config.json")
-            self.hf_pipeline = self.get_pipeline(
-                task=task,
-                model_id_or_path=model_id_or_path,
-                device=device_id,
-                kwargs=kwargs)
-        else:
-            raise ValueError("You need to define 'task' options.")
+        if not task:
+            task = self.infer_task_from_model_architecture(model_id_or_path)
+
+        self.hf_pipeline = self.get_pipeline(task=task,
+                                             model_id_or_path=model_id_or_path,
+                                             device=device_id,
+                                             kwargs=kwargs)
 
         self.initialized = True
 
@@ -242,17 +233,17 @@ def wrapped_pipeline(inputs, *args, **kwargs):
                     torch.cuda.current_device())
             with torch.no_grad():
                 output_tokens = model.generate(*args, **input_tokens, **kwargs)
-            generated_text = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)
+            generated_text = tokenizer.batch_decode(output_tokens,
+                                                    skip_special_tokens=True)
 
             return [{"generated_text": s} for s in generated_text]
 
         return wrapped_pipeline
 
     @staticmethod
     def infer_task_from_model_architecture(model_config_path: str):
-        with open(model_config_path, "r+") as config_file:
-            config = json.loads(config_file.read())
-            architecture = config.get("architectures", [None])[0]
+        model_config = AutoConfig.from_pretrained(model_config_path)
+        architecture = model_config.architectures[0]
 
         task = None
         for arch_options in ARCHITECTURES_2_TASK:
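The substance of the fix is in the two hunks above: initialize() now falls back to infer_task_from_model_architecture() whenever no task is configured, and that helper resolves the architecture through AutoConfig.from_pretrained instead of reading config.json by hand, so both local model directories and Hub model ids work for no-code deployments. Below is a minimal standalone sketch of that inference path; the architecture-to-task mapping is a hypothetical stand-in for the repository's ARCHITECTURES_2_TASK table and may not match its real entries.

from transformers import AutoConfig

# Hypothetical mapping; the real ARCHITECTURES_2_TASK lives in
# djl_python/huggingface.py and may contain different entries.
ARCHITECTURES_2_TASK = {
    "ForCausalLM": "text-generation",
    "ForQuestionAnswering": "question-answering",
    "ForSequenceClassification": "text-classification",
}


def infer_task_from_model_architecture(model_id_or_path: str) -> str:
    # AutoConfig accepts a local model directory or a Hub model id, which is
    # why the handler no longer needs to parse config.json itself.
    config = AutoConfig.from_pretrained(model_id_or_path)
    architecture = config.architectures[0]
    for suffix, task in ARCHITECTURES_2_TASK.items():
        if architecture.endswith(suffix):
            return task
    raise ValueError(f"Task could not be inferred from {architecture}")


# databricks/dolly-v2-7b reports GPTNeoXForCausalLM, so this prints
# "text-generation" (fetching the config requires network access).
print(infer_task_from_model_architecture("databricks/dolly-v2-7b"))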
tests/integration/llm/client.py: 10 changes (7 additions, 3 deletions)
@@ -66,7 +66,7 @@
"worker": 1
},
"no-code/databricks/dolly-v2-7b": {
"max_memory_per_gpu": [10.0, 10.0, 11.0, 12.0],
"max_memory_per_gpu": [10.0, 10.0, 12.0, 12.0],
"batch_size": [1, 4],
"seq_length": [16, 32],
"worker": 2,
@@ -277,7 +277,9 @@ def test_handler(model, model_spec):
             result
         ) <= seq_length, "generated more takens than max_new_tokens"
         result_0 = json.loads(result[0])['outputs']
-        assert len(result_0) == batch_size, "batch size number of tokens are not generated"
+        assert len(
+            result_0
+        ) == batch_size, "batch size number of tokens are not generated"
     else:
         res = res.json()
         logging.info(f"res {res}")
@@ -422,7 +424,9 @@ def test_transformers_neuronx_handler(model, model_spec):
     if spec.get("stream_output", False):
         logging.info(f"res: {res.content}")
         result = res.content.decode().split("\n")[:-1]
-        assert len(result) <= seq_length, "generated more takens than max_new_tokens"
+        assert len(
+            result
+        ) <= seq_length, "generated more takens than max_new_tokens"
     else:
         res = res.json()
         logging.info(f"res {res}")
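For context on the reformatted assertions: when stream_output is enabled, the client splits the newline-delimited response body into chunks, checks that the chunk count stays within seq_length (the configured max_new_tokens), and checks that the first chunk carries one output per request in the batch. A small sketch of that check, using a fabricated two-chunk stream in place of a real HTTP response:

import json


def check_stream_output(raw: bytes, seq_length: int, batch_size: int) -> None:
    # The body is newline-delimited; split("\n")[:-1] drops the empty element
    # after the trailing newline, mirroring the integration test client.
    result = raw.decode().split("\n")[:-1]
    assert len(result) <= seq_length, "generated more tokens than max_new_tokens"
    outputs = json.loads(result[0])["outputs"]
    assert len(outputs) == batch_size, "batch size number of outputs not generated"


# Fabricated stream for a batch of 2 with two generation steps.
check_stream_output(b'{"outputs": ["a", "b"]}\n{"outputs": ["a c", "b d"]}\n',
                    seq_length=16,
                    batch_size=2)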
