Merge branch 'master' of github.com:taichi-dev/taichi into matrix_ndarray_pr3
jim19930609 committed Aug 29, 2022
2 parents 87f51da + 256c7e8 commit 73d3fce
Showing 31 changed files with 386 additions and 386 deletions.
1 change: 0 additions & 1 deletion .github/workflows/scripts/unix-build-v2.sh
@@ -26,7 +26,6 @@ build_taichi_wheel() {
fi
python3 misc/make_changelog.py --ver origin/master --repo_dir ./ --save

export TAICHI_CMAKE_ARGS="${TAICHI_CMAKE_ARGS} -DTI_WITH_C_API=ON"
python3 setup.py $PROJECT_TAGS bdist_wheel $EXTRA_ARGS
sccache -s
}
1 change: 0 additions & 1 deletion .github/workflows/scripts/unix_build.sh
@@ -23,7 +23,6 @@ build_taichi_wheel() {
fi
python3 misc/make_changelog.py --ver origin/master --repo_dir ./ --save

TAICHI_CMAKE_ARGS="${TAICHI_CMAKE_ARGS} -DTI_WITH_C_API=ON"
exec env TAICHI_CMAKE_ARGS="${TAICHI_CMAKE_ARGS}" python3 setup.py $PROJECT_TAGS bdist_wheel $EXTRA_ARGS
sccache -s
}
5 changes: 3 additions & 2 deletions .github/workflows/testing.yml
@@ -170,7 +170,7 @@ jobs:
env:
PY: ${{ matrix.python }}
PROJECT_NAME: taichi
TAICHI_CMAKE_ARGS: -DTI_WITH_OPENGL:BOOL=OFF -DTI_WITH_CC:BOOL=${{ matrix.with_cc }} -DTI_WITH_VULKAN:BOOL=OFF -DTI_BUILD_TESTS:BOOL=ON -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache
TAICHI_CMAKE_ARGS: -DTI_WITH_OPENGL:BOOL=OFF -DTI_WITH_CC:BOOL=${{ matrix.with_cc }} -DTI_WITH_VULKAN:BOOL=OFF -DTI_BUILD_TESTS:BOOL=ON -DTI_WITH_C_API=ON -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache

- name: Test
id: test
@@ -262,7 +262,7 @@ jobs:
.github/workflows/scripts/unix_build.sh
brew uninstall molten-vk
env:
TAICHI_CMAKE_ARGS: -DTI_WITH_OPENGL:BOOL=OFF -DTI_WITH_CC:BOOL=${{ matrix.with_cc }} -DTI_WITH_VULKAN:BOOL=ON -DTI_BUILD_TESTS:BOOL=${{ matrix.with_cpp_tests }} -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache
TAICHI_CMAKE_ARGS: -DTI_WITH_OPENGL:BOOL=OFF -DTI_WITH_CC:BOOL=${{ matrix.with_cc }} -DTI_WITH_VULKAN:BOOL=ON -DTI_WITH_C_API=ON -DTI_BUILD_TESTS:BOOL=${{ matrix.with_cpp_tests }} -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache
CXX: clang++

# [DEBUG] Copy this step around to enable debugging inside Github Action instances.
@@ -667,6 +667,7 @@ jobs:
-DTI_WITH_OPENGL:BOOL=ON
-DTI_WITH_CC:BOOL=OFF
-DTI_WITH_VULKAN:BOOL=ON
-DTI_WITH_C_API=ON
-DCMAKE_C_COMPILER_LAUNCHER=sccache
-DCMAKE_CXX_COMPILER_LAUNCHER=sccache
112 changes: 64 additions & 48 deletions docs/lang/articles/advanced/quant.md
@@ -2,81 +2,96 @@
sidebar_position: 3
---

# Using quantized data types
# Use quantized data types

High-resolution simulations can deliver great visual quality, but they are often
limited by available memory, especially on GPUs. For the sake of saving memory,
Taichi provides low-precision ("quantized") data types. You can define your own integers,
fixed-point numbers, or floating-point numbers with a non-standard number of bits so
that you can choose a proper setting with minimum memory for your applications.
Taichi provides a suite of tailored domain-specific optimizations to ensure that the
runtime performance with quantized data types stays close to that with full-precision
data types.
High-resolution simulations can deliver great visual quality, but are often limited by the capacity of the onboard memory, GPU memory in particular.

To help reduce the memory footprint of your programs, Taichi provides quantized data types, also known as low-precision data types. They allow you to define your own integers, fixed-point numbers, or floating-point numbers with an arbitrary number of bits, so that you can pick the setting that works best within your limited memory capacity. At the same time, Taichi provides a suite of tailored optimizations to ensure that the runtime performance with quantized data types is comparable to the performance with full-precision data types.

:::note
Quantized data types are only supported on CPU and CUDA backends for now.
For now, quantized data types are supported only on the CPU and CUDA backends.
:::

## Quantized data types

Taichi supports the following quantized data types:

- Quantized integers
- Quantized fixed-point numbers
- Quantized floating-point numbers

### Quantized integers

Modern computers represent integers using the [two's complement](https://en.wikipedia.org/wiki/Two%27s_complement)
format. *Quantized integers* in Taichi adopt the same format, and can contain
a non-standard number of bits:
Quantized integers in Taichi are represented in the [two's complement](https://en.wikipedia.org/wiki/Two's_complement) format but can contain an arbitrary number of bits.

- To define a 10-bit signed integer type:

```python
i10 = ti.types.quant.int(bits=10) # 10-bit signed (default) integer type
u5 = ti.types.quant.int(bits=5, signed=False) # 5-bit unsigned integer type
i10 = ti.types.quant.int(bits=10) # `signed` is set to `True` by default
```

- To define a 5-bit unsigned integer type:

```python
u5 = ti.types.quant.int(bits=5, signed=False)
```
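
The bit width determines the representable range. As a quick illustrative note (plain two's-complement arithmetic, not a Taichi API; the `int_range` helper below is hypothetical), the `i10` and `u5` types defined above cover the following ranges:

```python
# Illustrative only: ranges implied by a bit width under two's-complement
# conventions, matching the `i10` and `u5` definitions above.
def int_range(bits: int, signed: bool = True):
    if signed:
        return -(2 ** (bits - 1)), 2 ** (bits - 1) - 1
    return 0, 2 ** bits - 1

print(int_range(10))                # (-512, 511) for i10
print(int_range(5, signed=False))   # (0, 31) for u5
```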

### Quantized fixed-point numbers

[Fixed-point numbers](https://en.wikipedia.org/wiki/Fixed-point_arithmetic) are
an old way to represent real numbers. The internal representation of a fixed-point number is simply an integer, and
its actual value equals the integer multiplied by a predefined scaling
factor. Based on the support for quantized integers, Taichi provides *quantized
fixed-point numbers* as follows:
The core idea of [fixed-point numbers](https://en.wikipedia.org/wiki/Fixed-point_arithmetic) is that, if a specific range is evenly divided into a number of scale units, then a real number within that range can be approximated by multiplying the size of one scale unit by an integer. For example, if you wish to represent a real number within [0, 100] in 10 binary bits, each *scale unit* equals 100/2<sup>10</sup> &asymp; 0.098.
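
To make that arithmetic concrete, here is a minimal sketch; the `scale_unit` helper is hypothetical and only illustrates the formula above, it is not part of the Taichi API:

```python
# Illustrative only: size of one scale unit when the range [0, max_value]
# is split evenly across 2**bits steps.
def scale_unit(max_value: float, bits: int) -> float:
    return max_value / (2 ** bits)

print(scale_unit(100.0, 10))  # 0.09765625, i.e. roughly 0.098 as noted above
```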

Taichi allows you to define quantized fixed-point types with fewer than 64 bits and an arbitrary scale unit.

- To define a 10-bit signed fixed-point type within the range [-20.0, 20.0]:

```python
fixed_type_a = ti.types.quant.fixed(bits=10, max_value=20.0) # 10-bit signed (default) fixed-point type within [-20.0, 20.0]
fixed_type_b = ti.types.quant.fixed(bits=5, signed=False, max_value=100.0) # 5-bit unsigned fixed-point type within [0.0, 100.0]
fixed_type_c = ti.types.quant.fixed(bits=6, signed=False, scale=1.0) # 6-bit unsigned fixed-point type within [0, 64.0]
fixed_type_a = ti.types.quant.fixed(bits=10, max_value=20.0) # `signed` is set to `True` by default
```

`scale` is the scaling factor mentioned above. Because fixed-point numbers are
especially useful when you know the actual value is guaranteed to be within a
range, Taichi allows you to simply set `max_value` and will calculate the
scaling factor accordingly.
- To define a 5-bit unsigned fixed-point type within the range [0.0, 100.0]:

```python
fixed_type_b = ti.types.quant.fixed(bits=5, signed=False, max_value=100.0)
```

- To define a 6-bit unsigned fixed-point type within [0.0, 64.0]:

```python
fixed_type_c = ti.types.quant.fixed(bits=6, signed=False, scale=1.0) # `scale` is a predefined scaling factor
```

> Set either `scale` or `max_value`, and Taichi works out the other based on your setting. Do *not* set both.
> `max_value` is a more commonly used parameter, because you may already know the range of the number to represent.
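
To see how the two parameters relate, here is a small sketch based on the examples above (an assumption inferred from `fixed_type_c`, where `scale=1.0` with 6 unsigned bits yields the range [0.0, 64.0]): for an unsigned type, `max_value` equals `scale * 2**bits`, so the following two definitions are expected to describe the same type.

```python
import taichi as ti

# Assumed equivalence for an unsigned 6-bit fixed-point type:
# max_value = scale * 2**bits = 1.0 * 64 = 64.0
via_scale = ti.types.quant.fixed(bits=6, signed=False, scale=1.0)
via_max_value = ti.types.quant.fixed(bits=6, signed=False, max_value=64.0)
```
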
### Quantized floating-point numbers

[Floating-point numbers](https://en.wikipedia.org/wiki/Floating-point_arithmetic)
are the standard way to represent real numbers on modern computers. A
floating-point number is composed of exponent bits, fraction bits, and a sign
bit. There are various floating-point formats:
A [floating-point number](https://en.wikipedia.org/wiki/Floating-point_arithmetic) comprises exponent bits, fraction bits, and a sign bit. There are various floating-point formats:

![image](../static/assets/floating-point_formats.png)

In Taichi, you can define a *quantized floating-point number* with an arbitrary
combination of exponent bits and fraction bits (the sign bit is made part of
the fraction bits):
Taichi allows you to define a *quantized floating-point number* with an arbitrary combination of exponent bits and fraction bits (the sign bit is made part of the fraction bits).

- To define a 15-bit signed floating-point type with five exponent bits:

```python
float_type_a = ti.types.quant.float(exp=5, frac=10) # 15-bit signed (default) floating-point type with 5 exponent bits
float_type_b = ti.types.quant.float(exp=6, frac=9, signed=False) # 15-bit unsigned floating-point type with 6 exponent bits
float_type_a = ti.types.quant.float(exp=5, frac=10) # `signed` is set to `True` by default
```

- To define a 15-bit unsigned floating-point type with six exponent bits:

```python
float_type_b = ti.types.quant.float(exp=6, frac=9, signed=False)
```
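
Because the sign bit is counted as one of the fraction bits, the total width of a quantized float is simply `exp + frac`. As an illustrative assumption (not an example from this article), a 16-bit layout similar to IEEE 754 half precision, which uses 5 exponent bits, 10 fraction bits, and 1 sign bit, could be written as:

```python
import taichi as ti

# 5 exponent bits + (10 fraction bits + 1 sign bit) = 16 bits in total.
half_like = ti.types.quant.float(exp=5, frac=11)
```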

### Compute types

All the parameters you've seen above specify the *storage type* of a
quantized data type. However, most quantized data types have no native support
on hardware, so an actual value of that quantized data type needs to be converted to
a primitive type (its "*compute type*") when it is involved in computation.
All the above-mentioned parameters specify how a quantized data type is stored in your computer. However, most quantized data types have no native support on hardware, so an actual value of that quantized data type needs to be converted to a primitive type ("*compute type*") during computation.

- The default compute type for quantized integers is `ti.i32`,
- The default compute type for quantized fixed-point numbers is `ti.f32`,
- The default compute type for quantized floating-point numbers is `ti.f32`.

The default compute type for quantized integers is `ti.i32`, while the default
compute type for quantized fixed-point/floating-point numbers is `ti.f32`. You
can change the compute type by specifying the `compute` parameter:
To change the compute type of a quantized data type, set the `compute` parameter when defining the quantized data type:

```python
i21 = ti.types.quant.int(bits=21, compute=ti.i64)
@@ -85,9 +100,10 @@ bfloat16 = ti.types.quant.float(exp=8, frac=8, compute=ti.f32)

## Data containers for quantized data types

Because the storage types are not primitive types, you may now wonder how
quantized data types can work together with data containers that Taichi
provides. In fact, some special constructs are introduced to eliminate the gap.
Quantized data types are not primitive types and hence require the following constructs to work with Taichi's data containers.

- Bitpacked fields
- Quant arrays

### Bitpacked fields
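
As a rough sketch of the idea before the details: a bitpacked field groups several quantized-type fields into a single 32-bit unit and places that unit under an SNode. The snippet below is an assumption for illustration (in particular, `ti.BitpackedFields` and its `max_num_bits`/`place` usage are assumed here), not an excerpt from this article:

```python
import taichi as ti
ti.init()

u4 = ti.types.quant.int(bits=4, signed=False)
fixed14 = ti.types.quant.fixed(bits=14, max_value=10.0)

a = ti.field(dtype=u4)
b = ti.field(dtype=fixed14)

# Pack both quantized fields into one 32-bit unit (4 + 14 = 18 bits used),
# then lay out 16 such units densely.
bitpack = ti.BitpackedFields(max_num_bits=32)
bitpack.place(a, b)
ti.root.dense(ti.i, 16).place(bitpack)
```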

@@ -230,9 +246,9 @@ def assign_vectorized():
assign_vectorized()
```

## Advanced examples
## Reference examples

The following examples are picked from the
The following examples are from the
[QuanTaichi paper](https://yuanming.taichi.graphics/publication/2021-quantaichi/quantaichi.pdf),
so you can dig into details there.

9 changes: 0 additions & 9 deletions taichi/analysis/value_diff.cpp
@@ -80,15 +80,6 @@ class ValueDiffLoopIndex : public IRVisitor {
}
}

void visit(ElementShuffleStmt *stmt) override {
int old_lane = lane;
auto src = stmt->elements[lane].stmt;
lane = stmt->elements[lane].index;
src->accept(this);
results[stmt->instance_id] = results[src->instance_id];
lane = old_lane;
}

void visit(ConstStmt *stmt) override {
if (stmt->val[lane].dt->is_primitive(PrimitiveTypeID::i32)) {
results[stmt->instance_id] = DiffRange(true, 0, stmt->val[lane].val_i32);
2 changes: 1 addition & 1 deletion taichi/codegen/codegen.cpp
@@ -74,7 +74,7 @@ bool KernelCodeGen::maybe_read_compilation_from_cache(
return false;
}
data.swap(cache_data.compiled_data_list);
kernel->set_from_offline_cache();
kernel->mark_as_from_cache();
return true;
}

8 changes: 8 additions & 0 deletions taichi/codegen/cuda/codegen_cuda.cpp
@@ -87,6 +87,14 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {
value_type = tlctx->get_data_type(PrimitiveType::f64);
value = builder->CreateFPExt(value, value_type);
}
if (arg_stmt->ret_type->is_primitive(PrimitiveTypeID::i8)) {
value_type = tlctx->get_data_type(PrimitiveType::i16);
value = builder->CreateSExt(value, value_type);
}
if (arg_stmt->ret_type->is_primitive(PrimitiveTypeID::u8)) {
value_type = tlctx->get_data_type(PrimitiveType::u16);
value = builder->CreateZExt(value, value_type);
}

types.push_back(value_type);
values.push_back(value);
60 changes: 29 additions & 31 deletions taichi/codegen/llvm/codegen_llvm.cpp
@@ -130,25 +130,33 @@ void TaskCodeGenLLVM::visit(AllocaStmt *stmt) {
// The return type is vector<tensor_type>* if real matrices are in use;
// otherwise the return type is [type * array_size]*
if (stmt->is_shared) {
size_t data_element_size = tlctx->get_type_size(
tlctx->get_data_type(tensor_type->get_element_type()));
auto type = llvm::ArrayType::get(
llvm::Type::getInt8Ty(*llvm_context),
data_element_size * tensor_type->get_num_elements());
auto array_type =
llvm::ArrayType::get(type, tensor_type->get_num_elements());
auto base = new llvm::GlobalVariable(
*module, type, false, llvm::GlobalValue::ExternalLinkage, nullptr,
fmt::format("shared_array_{}", stmt->id), nullptr,
*module, array_type, false, llvm::GlobalValue::ExternalLinkage,
nullptr, fmt::format("shared_array_{}", stmt->id), nullptr,
llvm::GlobalVariable::NotThreadLocal, 3 /*addrspace=shared*/);
base->setAlignment(llvm::MaybeAlign(8));

auto ptr = builder->CreateGEP(
// FIXME: create the GEP manually instead of using builder->CreateGEP for
// opaque pointers in LLVM 15.
// builder->CreateGEP would simply return `base` because all indices are zero.
// When opaque pointers are enabled, CreatePointerCast only creates an address
// space cast instead of a bitcast plus an address space cast, so the type
// that was kept by the bitcast is lost.
// The manually created GEP is used to keep the type.
// Later, when lowering PtrOffsetStmt, the type should be the element type
// instead of array_type.
// Once the LLVM type is converted directly from Taichi IR when lowering
// PtrOffsetStmt, we can switch back to builder->CreateGEP.
auto *gep = llvm::GetElementPtrInst::CreateInBounds(
#ifdef TI_LLVM_15
base->getValueType(),
array_type,
#endif
base, {tlctx->get_constant(0), tlctx->get_constant(0)});
auto ptr_type = llvm::PointerType::get(
tlctx->get_data_type(tensor_type->get_element_type()), 0);
llvm_val[stmt] = builder->CreatePointerCast(ptr, ptr_type);
builder->Insert(gep);
auto ptr_type = llvm::PointerType::get(type, 0);
llvm_val[stmt] = builder->CreatePointerCast(gep, ptr_type);
} else {
if (kernel->program->config.real_matrix)
llvm_val[stmt] =
@@ -813,6 +821,12 @@ void TaskCodeGenLLVM::visit(PrintStmt *stmt) {
dtype->is_primitive(PrimitiveTypeID::f16))
return this->builder->CreateFPExt(
to_print, this->tlctx->get_data_type(PrimitiveType::f64));
if (dtype->is_primitive(PrimitiveTypeID::i8))
return builder->CreateSExt(to_print,
tlctx->get_data_type(PrimitiveType::i16));
if (dtype->is_primitive(PrimitiveTypeID::u8))
return builder->CreateZExt(to_print,
tlctx->get_data_type(PrimitiveType::u16));
return to_print;
};
for (auto const &content : stmt->contents) {
@@ -1508,24 +1522,6 @@ void TaskCodeGenLLVM::visit(GlobalLoadStmt *stmt) {
create_global_load(stmt, false);
}

void TaskCodeGenLLVM::visit(ElementShuffleStmt *stmt){
TI_NOT_IMPLEMENTED
/*
auto init = stmt->elements.serialize(
[](const VectorElement &elem) {
return fmt::format("{}[{}]", elem.stmt->raw_name(), elem.index);
},
"{");
if (stmt->pointer) {
emit("{} * const {} [{}] {};", data_type_name(stmt->ret_type),
stmt->raw_name(), stmt->width(), init);
} else {
emit("const {} {} ({});", stmt->ret_data_type_name(), stmt->raw_name(),
init);
}
*/
}

std::string TaskCodeGenLLVM::get_runtime_snode_name(SNode *snode) {
if (snode->type == SNodeType::root) {
return "Root";
@@ -1739,6 +1735,8 @@ void TaskCodeGenLLVM::visit(PtrOffsetStmt *stmt) {
ptr_ty = alloc->getAllocatedType();
else if (auto *gv = llvm::dyn_cast<llvm::GlobalVariable>(val))
ptr_ty = gv->getValueType();
else if (auto *gep = llvm::dyn_cast<llvm::GEPOperator>(val))
ptr_ty = gep->getResultElementType();
else if (stmt->origin->is<GlobalTemporaryStmt>()) {
auto *tmpo_stmt = stmt->origin->cast<GlobalTemporaryStmt>();
if (tmpo_stmt->ret_type->is<TensorType>()) {
2 changes: 0 additions & 2 deletions taichi/codegen/llvm/codegen_llvm.h
@@ -280,8 +280,6 @@ class TaskCodeGenLLVM : public IRVisitor, public LLVMModuleBuilder {

void visit(GlobalLoadStmt *stmt) override;

void visit(ElementShuffleStmt *stmt) override;

void visit(GetRootStmt *stmt) override;

void visit(BitExtractStmt *stmt) override;
1 change: 0 additions & 1 deletion taichi/inc/frontend_statements.inc.h
@@ -8,7 +8,6 @@ PER_STATEMENT(FrontendBreakStmt)
PER_STATEMENT(FrontendContinueStmt)
PER_STATEMENT(FrontendAllocaStmt)
PER_STATEMENT(FrontendAssignStmt)
PER_STATEMENT(FrontendEvalStmt)
PER_STATEMENT(FrontendSNodeOpStmt) // activate, deactivate, append, clear
PER_STATEMENT(FrontendAssertStmt)
PER_STATEMENT(FrontendFuncDefStmt)
1 change: 0 additions & 1 deletion taichi/inc/statements.inc.h
@@ -59,7 +59,6 @@ PER_STATEMENT(GetChStmt)
// With per-lane attributes
PER_STATEMENT(LocalLoadStmt)
PER_STATEMENT(GlobalPtrStmt)
PER_STATEMENT(ElementShuffleStmt)

// Offloaded
PER_STATEMENT(OffloadedStmt)
22 changes: 0 additions & 22 deletions taichi/ir/statements.h
@@ -974,28 +974,6 @@ class WhileStmt : public Stmt {
TI_DEFINE_ACCEPT
};

// TODO: document for this
class ElementShuffleStmt : public Stmt {
public:
LaneAttribute<VectorElement> elements;
bool pointer;

explicit ElementShuffleStmt(const LaneAttribute<VectorElement> &elements,
bool pointer = false)
: elements(elements), pointer(pointer) {
TI_ASSERT(elements.size() == 1); // TODO: support vectorized cases
ret_type = elements[0].stmt->element_type();
TI_STMT_REG_FIELDS;
}

bool has_global_side_effect() const override {
return false;
}

TI_STMT_DEF_FIELDS(ret_type, elements, pointer);
TI_DEFINE_ACCEPT_AND_CLONE
};

// TODO: remove this (replace with input + ConstStmt(offset))
class IntegerOffsetStmt : public Stmt {
public:
