improve UT coverage of PT Utils and Quantization (#1842)
* update UTs

---------

Signed-off-by: xin3he <xin3.he@intel.com>
Signed-off-by: xinhe3 <xinhe3@habana.ai>
xin3he authored Jun 12, 2024
1 parent 6b27383 commit e9cb48c
Showing 14 changed files with 346 additions and 254 deletions.
@@ -11,3 +11,5 @@ neural-compressor
intel-extension-for-transformers
lm_eval==0.4.2
peft
auto_round
intel_extension_for_pytorch
@@ -236,9 +236,10 @@ def get_user_model():
# 3.x api
if args.approach == 'weight_only':
from neural_compressor.torch.quantization import RTNConfig, GPTQConfig, prepare, convert, quantize
from neural_compressor.torch.utils import get_double_quant_config
from neural_compressor.torch.utils import get_double_quant_config_dict
weight_sym = True if args.woq_scheme == "sym" else False
double_quant_config_dict = get_double_quant_config(args.double_quant_type)
if args.double_quant_type is not None:
double_quant_config_dict = get_double_quant_config_dict(args.double_quant_type)

if args.woq_algo == "RTN":
if args.double_quant_type is not None:
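For orientation, a minimal sketch of the 3.x weight-only flow these imports feed into; the toy model and the specific RTNConfig values are illustrative (borrowed from the parameters exercised in test_rtn.py later in this commit), not the example script's real configuration:

import torch
from neural_compressor.torch.quantization import RTNConfig, convert, prepare

# Toy model; get_user_model() in the example script returns a real causal-LM instead.
model = torch.nn.Sequential(torch.nn.Linear(64, 64))

# Plain RTN weight-only config; when args.double_quant_type is set, the dict returned by
# get_double_quant_config_dict(...) supplies the double-quant related fields.
quant_config = RTNConfig(bits=4, use_sym=True, group_size=32)

model = prepare(model, quant_config)  # attach the RTN quantizer
model = convert(model)                # produce the weight-only quantized model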
2 changes: 1 addition & 1 deletion neural_compressor/torch/algorithms/pt2e_quant/core.py
@@ -26,7 +26,7 @@
from neural_compressor.common.utils import logger
from neural_compressor.torch.algorithms.base_algorithm import Quantizer
from neural_compressor.torch.algorithms.pt2e_quant import half_precision_rewriter as hp_rewriter
from neural_compressor.torch.utils import create_xiq_quantizer_from_pt2e_config
from neural_compressor.torch.algorithms.pt2e_quant.utility import create_xiq_quantizer_from_pt2e_config


class W8A8PT2EQuantizer(Quantizer):
79 changes: 79 additions & 0 deletions neural_compressor/torch/algorithms/pt2e_quant/utility.py
@@ -0,0 +1,79 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict

import torch
import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq
from torch.ao.quantization.observer import HistogramObserver, MinMaxObserver, PlaceholderObserver
from torch.ao.quantization.quantizer import QuantizationSpec
from torch.ao.quantization.quantizer.x86_inductor_quantizer import QuantizationConfig, X86InductorQuantizer


def create_quant_spec_from_config(dtype, sym, granularity, algo, is_dynamic=False) -> QuantizationSpec:
dtype_mapping: Dict[str, torch.dtype] = {"int8": torch.int8, "uint8": torch.uint8}
select_dtype = dtype_mapping[dtype]
min_max_mapping = {torch.int8: (-128, 127), torch.uint8: (0, 255)}
qscheme_mapping = {
"per_channel": {True: torch.per_channel_symmetric, False: torch.per_tensor_affine},
"per_tensor": {True: torch.per_tensor_symmetric, False: torch.per_tensor_affine},
}
observer_mapping = {
"placeholder": PlaceholderObserver,
"minmax": MinMaxObserver,
"kl": HistogramObserver,
}
# Force to use placeholder observer for dynamic quantization
if is_dynamic:
algo = "placeholder"
# algo
observer_or_fake_quant_ctr = observer_mapping[algo]
# qscheme
qscheme = qscheme_mapping[granularity][sym]
quantization_spec = QuantizationSpec(
dtype=select_dtype,
quant_min=min_max_mapping[select_dtype][0],
quant_max=min_max_mapping[select_dtype][1],
observer_or_fake_quant_ctr=observer_or_fake_quant_ctr,
qscheme=qscheme,
is_dynamic=is_dynamic,
)
return quantization_spec


def _map_inc_config_to_torch_quant_config(inc_config, is_dynamic=False) -> QuantizationConfig:
default_quant_config = xiq.get_default_x86_inductor_quantization_config(is_dynamic=is_dynamic)
input_act_quant_spec = create_quant_spec_from_config(
inc_config.act_dtype, inc_config.act_sym, inc_config.act_granularity, inc_config.act_algo, is_dynamic=is_dynamic
)
weight_quant_spec = create_quant_spec_from_config(
inc_config.w_dtype, inc_config.w_sym, inc_config.w_granularity, inc_config.w_algo
)
quant_config = QuantizationConfig(
input_activation=input_act_quant_spec,
output_activation=default_quant_config.output_activation,
weight=weight_quant_spec,
bias=default_quant_config.bias,
is_qat=False,
)
return quant_config


def create_xiq_quantizer_from_pt2e_config(config, is_dynamic=False) -> X86InductorQuantizer:
quantizer = xiq.X86InductorQuantizer()
# set global
global_config = _map_inc_config_to_torch_quant_config(config, is_dynamic)
quantizer.set_global(global_config)
# Skip the local config for now (need torch 2.4)
return quantizer
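These helpers were moved here verbatim from neural_compressor/torch/utils/utility.py (see the matching deletions further down). A small usage sketch, assuming only what the code above reads: the SimpleNamespace stands in for an INC config object exposing the act_*/w_* attributes consumed by _map_inc_config_to_torch_quant_config and is not a real config class:

from types import SimpleNamespace

from neural_compressor.torch.algorithms.pt2e_quant.utility import (
    create_quant_spec_from_config,
    create_xiq_quantizer_from_pt2e_config,
)

# Single activation spec: uint8, asymmetric, per-tensor, histogram ("kl") observer.
act_spec = create_quant_spec_from_config(dtype="uint8", sym=False, granularity="per_tensor", algo="kl")
assert act_spec.quant_min == 0 and act_spec.quant_max == 255

# Stand-in for an INC static-quant config (hypothetical attribute values).
cfg = SimpleNamespace(
    act_dtype="uint8", act_sym=False, act_granularity="per_tensor", act_algo="kl",
    w_dtype="int8", w_sym=True, w_granularity="per_tensor", w_algo="minmax",
)
quantizer = create_xiq_quantizer_from_pt2e_config(cfg, is_dynamic=False)
# The resulting X86InductorQuantizer is what W8A8PT2EQuantizer hands to torch's
# PT2E prepare/convert flow (torch.ao.quantization.quantize_pt2e).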
18 changes: 12 additions & 6 deletions neural_compressor/torch/utils/auto_accelerator.py
@@ -98,7 +98,7 @@ class CUDA_Accelerator:
return accelerator_registry.register_accelerator_impl(name=name, priority=priority)


class Auto_Accelerator(ABC):
class Auto_Accelerator(ABC): # pragma: no cover
@classmethod
@abstractmethod
def is_available(cls) -> bool:
@@ -175,7 +175,7 @@ def synchronize(self):


@register_accelerator(name="cuda", priority=PRIORITY_CUDA)
class CUDA_Accelerator(Auto_Accelerator):
class CUDA_Accelerator(Auto_Accelerator): # pragma: no cover
def __init__(self) -> None:
self._name = "cuda"

@@ -211,7 +211,7 @@ def empty_cache(self):


@register_accelerator(name="xpu", priority=PRIORITY_XPU)
class XPU_Accelerator(Auto_Accelerator):
class XPU_Accelerator(Auto_Accelerator): # pragma: no cover
def __init__(self) -> None:
self._name = "xpu"

@@ -250,7 +250,7 @@ def empty_cache(self):


@register_accelerator(name="hpu", priority=PRIORITY_HPU)
class HPU_Accelerator(Auto_Accelerator):
class HPU_Accelerator(Auto_Accelerator): # pragma: no cover
def __init__(self) -> None:
self._name = "hpu"

@@ -275,7 +275,10 @@ def synchronize(self):
return torch.hpu.synchronize()

def set_device(self, device_index):
return torch.hpu.set_device(device_index)
try:
torch.hpu.set_device(device_index)
except Exception as e:
logger.warning(e)

def current_device(self):
return torch.hpu.current_device()
@@ -287,7 +290,10 @@ def device(self, device_index=None):
return torch.hpu.device(device_index)

def empty_cache(self):
return torch.hpu.empty_cache()
try:
torch.hpu.empty_cache()
except Exception as e:
logger.warning(e)

def mark_step(self):
return htcore.mark_step()
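The registry above is what resolves the accelerator object used by the tests later in this commit; a short sketch of the typical call sites, assuming only the attributes shown in this diff:

from neural_compressor.torch.utils import accelerator  # resolved via the priority-ordered registry

device = accelerator.current_device_name()  # e.g. "cpu", "cuda:0" or "hpu", depending on what is installed
accelerator.synchronize()
accelerator.empty_cache()  # on HPU this now logs a warning instead of raising when the call is unsupported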
40 changes: 25 additions & 15 deletions neural_compressor/torch/utils/environ.py
@@ -13,24 +13,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib
import sys

import torch
from packaging.version import Version

# pylint:disable=import-error
try:
import habana_frameworks.torch.hpex

_hpex_available = True
except:
_hpex_available = False


def is_hpex_available():
return _hpex_available


################ Check imported sys.module first to decide behavior #################
def is_ipex_imported() -> bool:
for name, _ in sys.modules.items():
if name == "intel_extension_for_pytorch":
@@ -45,11 +35,29 @@ def is_transformers_imported() -> bool:
return False


try:
import intel_extension_for_pytorch as ipex
################ Check available sys.module to decide behavior #################
def is_package_available(package_name):
from importlib.util import find_spec

package_spec = find_spec(package_name)
return package_spec is not None


## check hpex
if is_package_available("habana_frameworks"):
_hpex_available = True
else:
_hpex_available = False


def is_hpex_available():
return _hpex_available


## check ipex
if is_package_available("intel_extension_for_pytorch"):
_ipex_available = True
except:
else:
_ipex_available = False


@@ -60,6 +68,8 @@ def is_ipex_available():
def get_ipex_version():
if is_ipex_available():
try:
import intel_extension_for_pytorch as ipex

ipex_version = ipex.__version__.split("+")[0]
except ValueError as e: # pragma: no cover
assert False, "Got an unknown version of intel_extension_for_pytorch: {}".format(e)
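The net effect is that availability checks no longer import the heavy frameworks at module load time: the probe only consults importlib metadata, and the real import happens lazily where it is needed (e.g. inside get_ipex_version above). A minimal usage sketch:

from neural_compressor.torch.utils.environ import (
    is_hpex_available,
    is_ipex_available,
    is_package_available,
)

# Cheap find_spec() lookup; nothing is imported yet.
if is_package_available("transformers"):
    print("transformers is installed")

if is_ipex_available():
    import intel_extension_for_pytorch as ipex  # imported only when actually used
    print(ipex.__version__)

if is_hpex_available():
    import habana_frameworks.torch.hpex  # noqa: F401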
66 changes: 1 addition & 65 deletions neural_compressor/torch/utils/utility.py
@@ -16,10 +16,6 @@
from typing import Callable, Dict, List, Tuple, Union

import torch
import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq
from torch.ao.quantization.observer import HistogramObserver, MinMaxObserver, PlaceholderObserver
from torch.ao.quantization.quantizer import QuantizationSpec
from torch.ao.quantization.quantizer.x86_inductor_quantizer import QuantizationConfig, X86InductorQuantizer
from typing_extensions import TypeAlias

from neural_compressor.common import logger
@@ -120,11 +116,9 @@ def get_model_info(model: torch.nn.Module, white_module_list: List[Callable]) ->
return filter_result


def get_double_quant_config(double_quant_type):
def get_double_quant_config_dict(double_quant_type="BNB_NF4"):
from neural_compressor.torch.utils.constants import DOUBLE_QUANT_CONFIGS

if double_quant_type is None:
return {}
assert double_quant_type in DOUBLE_QUANT_CONFIGS, "Supported double quant configs: {}".format(
list(DOUBLE_QUANT_CONFIGS.keys())
)
@@ -170,61 +164,3 @@ def postprocess_model(model, mode, quantizer):
elif mode == Mode.CONVERT or mode == Mode.QUANTIZE:
if getattr(model, "quantizer", False):
del model.quantizer


def create_quant_spec_from_config(dtype, sym, granularity, algo, is_dynamic=False) -> QuantizationSpec:
dtype_mapping: Dict[str, torch.dtype] = {"int8": torch.int8, "uint8": torch.uint8}
select_dtype = dtype_mapping[dtype]
min_max_mapping = {torch.int8: (-128, 127), torch.uint8: (0, 255)}
qscheme_mapping = {
"per_channel": {True: torch.per_channel_symmetric, False: torch.per_tensor_affine},
"per_tensor": {True: torch.per_tensor_symmetric, False: torch.per_tensor_affine},
}
observer_mapping = {
"placeholder": PlaceholderObserver,
"minmax": MinMaxObserver,
"kl": HistogramObserver,
}
# Force to use placeholder observer for dynamic quantization
if is_dynamic:
algo = "placeholder"
# algo
observer_or_fake_quant_ctr = observer_mapping[algo]
# qscheme
qscheme = qscheme_mapping[granularity][sym]
quantization_spec = QuantizationSpec(
dtype=select_dtype,
quant_min=min_max_mapping[select_dtype][0],
quant_max=min_max_mapping[select_dtype][1],
observer_or_fake_quant_ctr=observer_or_fake_quant_ctr,
qscheme=qscheme,
is_dynamic=is_dynamic,
)
return quantization_spec


def _map_inc_config_to_torch_quant_config(inc_config, is_dynamic=False) -> QuantizationConfig:
default_quant_config = xiq.get_default_x86_inductor_quantization_config(is_dynamic=is_dynamic)
input_act_quant_spec = create_quant_spec_from_config(
inc_config.act_dtype, inc_config.act_sym, inc_config.act_granularity, inc_config.act_algo, is_dynamic=is_dynamic
)
weight_quant_spec = create_quant_spec_from_config(
inc_config.w_dtype, inc_config.w_sym, inc_config.w_granularity, inc_config.w_algo
)
quant_config = QuantizationConfig(
input_activation=input_act_quant_spec,
output_activation=default_quant_config.output_activation,
weight=weight_quant_spec,
bias=default_quant_config.bias,
is_qat=False,
)
return quant_config


def create_xiq_quantizer_from_pt2e_config(config, is_dynamic=False) -> X86InductorQuantizer:
quantizer = xiq.X86InductorQuantizer()
# set global
global_config = _map_inc_config_to_torch_quant_config(config, is_dynamic)
quantizer.set_global(global_config)
# Skip the local config for now (need torch 2.4)
return quantizer
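The renamed helper now always returns a dict and defaults to "BNB_NF4"; the None check moved to the caller, as in the example-script change near the top of this diff. A hedged sketch of the new contract:

from neural_compressor.torch.utils import get_double_quant_config_dict

cfg = get_double_quant_config_dict()           # defaults to "BNB_NF4"
cfg = get_double_quant_config_dict("BNB_NF4")  # explicit form; unknown names now trip the assert
print(sorted(cfg.keys()))                      # keys come from DOUBLE_QUANT_CONFIGS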
3 changes: 1 addition & 2 deletions requirements_pt.txt
@@ -1,5 +1,4 @@
auto-round
intel_extension_for_pytorch
numpy
peft==0.10.0
psutil
py-cpuinfo
4 changes: 3 additions & 1 deletion test/3x/torch/quantization/weight_only/test_rtn.py
@@ -14,7 +14,7 @@
prepare,
quantize,
)
from neural_compressor.torch.utils import accelerator
from neural_compressor.torch.utils import accelerator, is_hpex_available

device = accelerator.current_device_name()

@@ -76,6 +76,8 @@ def test_int_params(self, bits, use_sym, group_size, group_dim):
model = convert(model)
out = model(self.example_inputs)[0]
assert (out != self.label).any(), "WOQ output should be different with raw output"
if is_hpex_available():
assert "hpu" in out.device, "Neural Compressor should run on HPU when HPEX is available."
if (bits, use_sym, group_size, group_dim) == (8, True, -1, 1):
assert torch.allclose(out, self.label, atol=0.01), "Accuracy gap atol > 0.01 is unexpected."
if (bits, use_sym, group_size, group_dim) == [(4, True, 128, 0), (4, True, 32, 1)]:
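As a side note on the new HPU assertion in test_int_params: torch.Tensor.device is a torch.device object rather than a string, so an equivalent placement check is usually spelled through .type or str(). A minimal sketch (model and example_inputs are hypothetical stand-ins for the test fixtures):

out = model(example_inputs)[0]
assert out.device.type == "hpu"  # or: "hpu" in str(out.device)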
2 changes: 2 additions & 0 deletions test/3x/torch/requirements.txt
@@ -1,4 +1,6 @@
auto_round
expecttest
intel_extension_for_pytorch
numpy
peft==0.10.0
prettytable