Skip to content

Commit

Permalink
[tokenizer] Fixes model zoo import script
Browse files (browse the repository at this point in the history)
  • Loading branch information
frankfliu committed Apr 29, 2024
1 parent 0e73488 commit e171941
Showing 1 changed file with 11 additions and 2 deletions.
13 changes: 11 additions & 2 deletions extensions/tokenizers/src/main/python/huggingface_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ def save_rust_model(self, model_info, args: Namespace, temp_dir: str):
include_types = False
else:
return False, f"Unsupported model_type: {config.model_type}", -1
else:
return False, f"Unknown model_type: {model_id}", -1

logging.info(f"Saving rust model: {model_id} ...")

Expand All @@ -116,6 +118,10 @@ def save_rust_model(self, model_info, args: Namespace, temp_dir: str):
logging.warning(e, exc_info=True)
return False, "Failed to save tokenizer", -1

# Save config.json
config_file = hf_hub_download(repo_id=model_id, filename="config.json")
shutil.copyfile(config_file, os.path.join(temp_dir, "config.json"))

target = os.path.join(temp_dir, "model.safetensors")
model = self.api.model_info(model_id, files_metadata=True)
has_sf_file = False
Expand Down Expand Up @@ -188,14 +194,17 @@ def save_pytorch_model(self, model_info, args: Namespace, temp_dir: str):
if not result:
return False, reason, -1

- size = self.save_to_model_zoo(model_info, args.output_dir, temp_dir,
- hf_pipeline, include_types)
+ size = self.save_to_model_zoo(model_info, args.output_dir, "PyTorch",
+ temp_dir, hf_pipeline, include_types)

return True, None, size

@staticmethod
def save_tokenizer(hf_pipeline, temp_dir: str):
hf_pipeline.tokenizer.save_pretrained(temp_dir)
if not os.path.exists(os.path.join(temp_dir, "tokenizer.json")):
raise ValueError("no fast tokenizer found.")

# only keep tokenizer.json file
for path in os.listdir(temp_dir):
if path != "tokenizer.json" and path != "tokenizer_config.json":
Expand Down

0 comments on commit e171941

Please sign in to comment.