Use dedicated logger instead of the root one #478

Merged: 3 commits, Oct 19, 2022
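For context: every change in this diff follows the same pattern, replacing calls on the root logger with a module-level logger named "xformers". A minimal sketch of the before/after (the warning text is taken from one of the touched files):

```python
import logging

# Before: the message goes through the root logger, so a downstream
# application cannot filter xformers warnings separately from its own.
logging.warning("Triton is not available, some optimizations will not be enabled.")

# After: each module creates one named logger and logs through it.
logger = logging.getLogger("xformers")
logger.warning("Triton is not available, some optimizations will not be enabled.")
```

By default the "xformers" logger still propagates to whatever handlers are configured on the root logger, so the output looks the same unless an application chooses to configure it.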
11 changes: 7 additions & 4 deletions xformers/__init__.py
@@ -12,6 +12,9 @@
except ImportError:
pass


logger = logging.getLogger("xformers")

_is_sparse_available: bool = True

# Set to true to utilize functorch
@@ -66,8 +69,8 @@ def _register_extensions():
_register_extensions()
except (ImportError, OSError) as e:
print(e)
logging.warning(
f"WARNING: {e}\nNeed to compile C++ extensions to get sparse attention suport."
logger.warning(
f"WARNING: {e}\nNeed to compile C++ extensions to get sparse attention support."
+ " Please run python setup.py build develop"
)
_is_sparse_available = False
@@ -94,7 +97,7 @@ def _is_triton_available():

return True
except (ImportError, AttributeError) as e:
logging.warning(
logger.warning(
f"A matching Triton is not available, some optimizations will not be enabled.\nError caught was: {e}"
)
return False
@@ -104,7 +107,7 @@ def _is_triton_available():
try:
from xformers.components.nvfuser import NVFusedBiasActivationDropout # noqa
except ImportError as e:
logging.warning(
logger.warning(
f"Functorch is not available, some optimizations will not be enabled.\nError caught was: {e}"
)
_is_functorch_available = False
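Because the import-time warnings above now go through the "xformers" logger, an application can tune or silence them without touching its own root-logger configuration. A small usage sketch (the level and formatter chosen here are only examples):

```python
import logging

# Silence xformers warnings such as the missing-Triton or
# missing-C++-extension messages emitted above.
logging.getLogger("xformers").setLevel(logging.ERROR)

# Or attach a dedicated handler so xformers messages are easy to spot.
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter("[xformers] %(levelname)s: %(message)s"))
logging.getLogger("xformers").addHandler(handler)
```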
5 changes: 4 additions & 1 deletion xformers/components/attention/__init__.py
@@ -19,6 +19,9 @@
from .attention_mask import AttentionMask
from .base import Attention, AttentionConfig # noqa

logger = logging.getLogger("xformers")


# CREDITS: Classy Vision registry mechanism

ATTENTION_REGISTRY: Dict[str, Any] = {}
@@ -45,7 +48,7 @@ def build_attention(config: Union[Dict[str, Any], AttentionConfig]):
)
except KeyError as e:
name = config["name"]
logging.warning(f"{name} not available among {ATTENTION_REGISTRY.keys()}")
logger.warning(f"{name} not available among {ATTENTION_REGISTRY.keys()}")
raise e
else:
config_instance = config
13 changes: 9 additions & 4 deletions xformers/components/attention/blocksparse.py
@@ -13,7 +13,12 @@
from xformers import _is_triton_available
from xformers.components.attention import Attention, AttentionConfig, register_attention

logger = logging.getLogger("xformers")


_is_blocksparse_available = _is_triton_available()


if _is_blocksparse_available:
from triton.ops.blocksparse import matmul as blocksparse_matmul # type: ignore
from triton.ops.blocksparse import softmax as blocksparse_softmax # type: ignore
@@ -22,7 +27,7 @@

# Blocksparse requires Tensor cores
if gpu_capabilities_older_than_70():
logging.warning(
logger.warning(
"Blocksparse is not available: the current GPU does not expose Tensor cores"
)
_is_blocksparse_available = False
@@ -64,14 +69,14 @@ def __init__(
**kwargs,
):
if layout.dim() == 2:
logging.warning(
logger.warning(
"The layout passed is lacking a head dimension and a batch dimension"
)
logging.warning(
logger.warning(
"Now assuming that the same layout is to be used across all heads"
)
layout = layout.unsqueeze(0).expand(num_heads, -1, -1)
logging.warning(f"New layout dimensions: {layout.shape}")
logger.warning(f"New layout dimensions: {layout.shape}")

assert block_size in (
16,
5 changes: 4 additions & 1 deletion xformers/components/attention/core.py
@@ -30,6 +30,9 @@
from xformers.components.attention.blocksparse import BlockSparseAttention


logger = logging.getLogger("xformers")


def _create_random_sparsity(matrix, sparsity, divisible_by=4):
assert matrix.ndim == 3
keep = torch.rand_like(matrix[0], dtype=torch.float32) > sparsity
@@ -311,7 +314,7 @@ def scaled_dot_product_attention(
)

if switch_to_blocksparse:
logging.info("Switching causal attention to Triton blocksparse...")
logger.info("Switching causal attention to Triton blocksparse...")
return blocksparse_attention(q, k, v, dropout, block_size)

with torch.cuda.amp.autocast(enabled=False) if autocast_disabled else nullcontext():
4 changes: 3 additions & 1 deletion xformers/components/attention/favor.py
@@ -21,6 +21,8 @@
SMReg,
)

logger = logging.getLogger("xformers")


@dataclass
class FavorAttentionConfig(AttentionConfig):
@@ -84,7 +86,7 @@ def __init__(
self.dim_features = 2 * (
self.dim_features // 2
) # needs to be even for some variants
logging.info(
logger.info(
f"FAVOR: Automatically setting the random mapping dimension to {self.dim_features} from {dim_head}"
)
else:
4 changes: 3 additions & 1 deletion xformers/components/attention/nystrom.py
@@ -22,6 +22,8 @@
reshape_key_padding_mask,
)

logger = logging.getLogger("xformers")


@dataclass
class NystromSelfAttentionConfig(AttentionConfig):
@@ -182,7 +184,7 @@ def forward(

if key_padding_mask is not None:
if key_padding_mask.dtype == torch.bool:
logging.warning(
logger.warning(
"Bool mask found, but an additive mask is expected. Converting but this is slow"
)

4 changes: 3 additions & 1 deletion xformers/components/attention/ortho.py
@@ -20,6 +20,8 @@
scaled_query_key_softmax,
)

logger = logging.getLogger("xformers")


class LandmarkSelection(str, Enum):
Orthogonal = "orthogonal"
@@ -105,7 +107,7 @@ def forward(
landmarks = self._cluster_landmarks(q, spherical=True)

if att_mask is not None:
logging.warning(
logger.warning(
"Orthoformer: attention mask passed alongside with using landmarks to reduce dimensions. \
The two are typically not compatible"
)
4 changes: 3 additions & 1 deletion xformers/components/attention/scaled_dot_product.py
@@ -18,6 +18,8 @@
)
from xformers.components.attention.core import scaled_dot_product_attention

logger = logging.getLogger("xformers")


@dataclass
class ScaledDotProductConfig(AttentionConfig):
@@ -118,7 +120,7 @@ def forward(
if isinstance(att_mask, AttentionMask):
att_mask = att_mask.make_crop(seq_len=q.shape[-2])
else:
logging.error(
logger.error(
"Mismatching sparse attention mask and sequence length."
+ " Please pad the inputs or adjust the attention mask"
)
5 changes: 4 additions & 1 deletion xformers/components/feedforward/fused_mlp.py
@@ -17,6 +17,9 @@
register_feedforward,
)

logger = logging.getLogger("xformers")


if torch.cuda.is_available():
try:
from xformers.triton import FusedDropoutBias
@@ -73,4 +76,4 @@ def forward(self, inputs: torch.Tensor) -> torch.Tensor:
return self.mlp(inputs)

except ImportError:
logging.warning("Triton is not available, FusedMLP will not be enabled.")
logger.warning("Triton is not available, FusedMLP will not be enabled.")
9 changes: 6 additions & 3 deletions xformers/components/feedforward/mixture_of_experts.py
@@ -18,6 +18,9 @@
register_feedforward,
)

logger = logging.getLogger("xformers")


_is_fairscale_available = True

try:
@@ -27,7 +30,7 @@
from xformers.components.feedforward import MLP

except ImportError:
logging.warning(
logger.warning(
"Either FairScale or torch distributed is not available, MixtureOfExperts will not be exposed."
" Please install them if you would like to use MoE"
)
@@ -105,8 +108,8 @@ def __init__(
assert number_of_experts >= number_of_local_experts
else:
if dist.get_world_size() == 1:
logging.warning("Local experts no specified but world size of 1")
logging.warning("Assuming that all experts are local")
logger.warning("Local experts no specified but world size of 1")
logger.warning("Assuming that all experts are local")
number_of_local_experts = number_of_experts
else:
number_of_local_experts = 1
6 changes: 4 additions & 2 deletions xformers/components/input_projection.py
@@ -14,6 +14,8 @@
import torch
from torch import nn

logger = logging.getLogger("xformers")


@dataclass
class InputProjectionConfig:
@@ -53,7 +55,7 @@ def __init__(
key_proj_params.bias,
)
else:
logging.info(
logger.info(
"No Key projection parameters were passed, assuming that the weights"
+ " are shared with the query projection"
)
@@ -66,7 +68,7 @@ def __init__(
value_proj_params.bias,
)
else:
logging.info(
logger.info(
"No Value projection parameters were passed, assuming that the weights"
+ " are shared with the query projection"
)
4 changes: 3 additions & 1 deletion xformers/components/multi_head_dispatch.py
@@ -16,6 +16,8 @@
from xformers.components.input_projection import InputProjection, InputProjectionConfig
from xformers.components.positional_embedding import RotaryEmbedding

logger = logging.getLogger("xformers")


@dataclass
class MultiHeadDispatchConfig:
@@ -90,7 +92,7 @@ def __init__(
super().__init__()

if isinstance(bias, bool):
logging.warning(
logger.warning(
"Single bias value provided for the MHA projections."
+ f" Assuming the same parameter ({bias}) is to be used everywhere"
)
6 changes: 4 additions & 2 deletions xformers/factory/block_factory.py
@@ -30,6 +30,8 @@
xFormerEncoderConfig,
)

logger = logging.getLogger("xformers")


def _get_ln_factory(
d_model: int,
@@ -113,7 +115,7 @@ def __init__(self, config: xFormerEncoderConfig, **kwargs):
mha_dim = config.multi_head_config["dim_model"]

if pos_encoding_dim != mha_dim:
logging.warning(
logger.warning(
f"The embedding dim and model dim do not match ({pos_encoding_dim} vs {mha_dim}), adding a projector layer." # noqa
)
self.embedding_projector = nn.Linear(pos_encoding_dim, mha_dim)
@@ -257,7 +259,7 @@ def __init__(self, config: xFormerDecoderConfig, **kwargs):

if pos_encoding_dim != mha_dim:

logging.warning(
logger.warning(
f"The embedding dim and model dim do not match ({pos_encoding_dim} vs {mha_dim}), adding a projector layer." # noqa
)

4 changes: 2 additions & 2 deletions xformers/factory/hydra_helper.py
@@ -14,7 +14,7 @@
from xformers.components.feedforward import FEEDFORWARD_REGISTRY
from xformers.components.positional_embedding import POSITION_EMBEDDING_REGISTRY

log = logging.getLogger(__name__)
logger = logging.getLogger("xformers")


def import_xformer_config_schema():
@@ -33,4 +33,4 @@ def import_xformer_config_schema():
try:
cs.store(name=f"{kk}_schema", node=v[kk].config, group=f"xformers/{k}")
except ValidationError as e:
log.debug(f"Error registering {kk}_schema, error: {e}")
logger.debug(f"Error registering {kk}_schema, error: {e}")
4 changes: 3 additions & 1 deletion xformers/factory/model_factory.py
@@ -20,6 +20,8 @@
from xformers.factory.block_factory import xFormerDecoderBlock, xFormerEncoderBlock
from xformers.factory.weight_init import get_weight_init_fn, xFormerWeightInit

logger = logging.getLogger("xformers")


@dataclass(init=False)
class xFormerConfig:
@@ -183,7 +185,7 @@ def __init__(
and decoders[0].pose_encoding
and not config.reversible
):
logging.info("Tying encoder and decoder embeddings, as requested")
logger.info("Tying encoder and decoder embeddings, as requested")
encoders[0].pose_encoding = decoders[0].pose_encoding

self.encoders: torch.nn.Module = (
5 changes: 4 additions & 1 deletion xformers/factory/weight_init.py
@@ -20,6 +20,9 @@
_no_grad_uniform_,
)

logger = logging.getLogger("xformers")


_assert_if_not_initialized = False


@@ -125,7 +128,7 @@ def _maybe_report_no_init(module, name):
return

# This is unexpected, warn about a possible unhandled weight
logging.warning(
logger.warning(
f"Not initializing weights in {name}, this could be a mistake.\nModule {module}"
)

9 changes: 6 additions & 3 deletions xformers/sparse/blocksparse_tensor.py
@@ -3,17 +3,20 @@
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.

import logging

import torch

from xformers.ops import masked_matmul

logger = logging.getLogger("xformers")


try:
from triton.ops.blocksparse import matmul as blocksparse_matmul
from triton.ops.blocksparse import softmax as blocksparse_softmax
except ImportError as e:
import logging

logging.warning(
logger.warning(
"Triton is not available, some optimizations will not be enabled.\n"
+ f"This is just a warning: {e}"
)