Skip to content

Commit

Permalink
Add POWER8 VSX toolchains (#4853)
Browse files Browse the repository at this point in the history
* Add POWER8 VSX toolchains

POWER8, though slower than POWER9, is still used in the wild; these
toolchains should still be much faster on POWER8 than POWER8 without VSX
optimizations.

* VSX toolchains: set -cpu arg in QEMU CI tests
  • Loading branch information
JeremyRand committed Aug 6, 2023
1 parent 4c861a0 commit 0a8cf31
Show file tree
Hide file tree
Showing 5 changed files with 93 additions and 7 deletions.
48 changes: 47 additions & 1 deletion .github/workflows/linux-ppc64-cpu-gcc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,52 @@ jobs:
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
TESTS_EXECUTABLE_LOADER=qemu-ppc64le TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc64le-linux-gnu" ctest --output-on-failure -j 2
linux-gcc-power8le-vsx:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v3

- name: cache-qemu
id: cache-qemu
uses: actions/cache@v3
with:
path: qemu-install
key: qemu-ppc64le-install-20220502-2
- name: install-qemu-build-deps
if: steps.cache-qemu.outputs.cache-hit != 'true'
run: |
sudo apt-get update
sudo apt-get install autoconf automake autotools-dev ninja-build
- name: checkout-qemu
if: steps.cache-qemu.outputs.cache-hit != 'true'
uses: actions/checkout@v3
with:
repository: qemu/qemu
path: qemu
ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65
- name: qemu
if: steps.cache-qemu.outputs.cache-hit != 'true'
run: |
cd qemu
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=ppc64le-linux-user --disable-system
make -j2
make install
- name: powerpc64le-gnu-toolchain
run: |
sudo apt-get update
sudo apt-get install g++-powerpc64le-linux-gnu
- name: configure
run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/power8le-linux-gnu-vsx.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- name: build
run: cmake --build build -j 2

- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
TESTS_EXECUTABLE_LOADER=qemu-ppc64le TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc64le-linux-gnu;-cpu;power8_v2.0" ctest --output-on-failure -j 2
linux-gcc-power9le-vsx:
runs-on: ubuntu-20.04
steps:
Expand Down Expand Up @@ -118,4 +164,4 @@ jobs:
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
TESTS_EXECUTABLE_LOADER=qemu-ppc64le TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc64le-linux-gnu" ctest --output-on-failure -j 2
TESTS_EXECUTABLE_LOADER=qemu-ppc64le TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc64le-linux-gnu;-cpu;power9_v2.0" ctest --output-on-failure -j 2
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -203,9 +203,9 @@ ncnn 目前已在腾讯多款应用中使用,如:QQ,Qzone,微信,天

## HowTo

**[how to build ncnn library](https://github.com/Tencent/ncnn/wiki/how-to-build) on Linux / Windows / macOS / Raspberry Pi3, Pi4 / Android / NVIDIA Jetson / iOS / WebAssembly / AllWinner D1 / Loongson 2K1000**
**[how to build ncnn library](https://github.com/Tencent/ncnn/wiki/how-to-build) on Linux / Windows / macOS / Raspberry Pi3, Pi4 / POWER / Android / NVIDIA Jetson / iOS / WebAssembly / AllWinner D1 / Loongson 2K1000**

- [Build for Linux / NVIDIA Jetson / Raspberry Pi3, Pi4 / POWER9](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-linux)
- [Build for Linux / NVIDIA Jetson / Raspberry Pi3, Pi4 / POWER](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-linux)
- [Build for Windows x64 using VS2017](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-windows-x64-using-visual-studio-community-2017)
- [Build for macOS](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-macos)
- [Build for ARM Cortex-A family with cross-compiling](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-arm-cortex-a-family-with-cross-compiling)
Expand Down
10 changes: 6 additions & 4 deletions docs/how-to-build/how-to-build.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ git submodule update --init
- [Build for Linux](#build-for-linux)
- [Nvidia Jetson](#nvidia-jetson)
- [Raspberry Pi](#raspberry-pi)
- [POWER9](#power9)
- [POWER](#power)
- [Intel oneAPI](#intel-oneapi)
- [Verification](#verification)
- [Build for Windows x64 using Visual Studio Community 2017](#build-for-windows-x64-using-visual-studio-community-2017)
Expand Down Expand Up @@ -89,9 +89,9 @@ You can add `-GNinja` to `cmake` above to use Ninja build system (invoke build u

For Rasberry Pi 3 on 32bit OS, add `-DCMAKE_TOOLCHAIN_FILE=../toolchains/pi3.toolchain.cmake` to cmake. You can also consider disabling Vulkan support as the Vulkan drivers for Rasberry Pi are still not mature, but it doesn't hurt to build the support in, but not use it.

#### POWER9
#### POWER

With Clang 13 or higher:
For POWER9 with Clang 13 or higher:

```shell
cd ncnn
Expand All @@ -103,7 +103,9 @@ make -j$(nproc)

Earlier versions of Clang may fail to build ncnn due to [Bug 49864](https://github.com/llvm/llvm-project/issues/49864). To use GCC instead, use the `power9le-linux-gnu-vsx.toolchain.cmake` toolchain file instead. Note that according to benchmarks, Clang appears to produce noticeably faster CPU inference than GCC for POWER9 targets.

Note that the POWER9 toolchain files only support little-endian mode.
For POWER8 instead of POWER9, use the `power8le-linux-gnu-vsx.clang.toolchain.cmake` or `power8le-linux-gnu-vsx.toolchain.cmake` toolchain file instead. POWER8 will be slower than POWER9.

Note that the POWER toolchain files only support little-endian mode.

#### Intel oneAPI

Expand Down
19 changes: 19 additions & 0 deletions toolchains/power8le-linux-gnu-vsx.clang.toolchain.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR powerpc64le)

set(CMAKE_C_COMPILER "clang")
set(CMAKE_CXX_COMPILER "clang++")

set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)

set(CMAKE_C_FLAGS "-target powerpc64le-linux-gnu -I/usr/powerpc64le-linux-gnu/include -mcpu=power8 -mtune=power8 -DNO_WARN_X86_INTRINSICS -D__MMX__ -D__SSE__ -D__SSSE3__")
set(CMAKE_CXX_FLAGS "-target powerpc64le-linux-gnu -I/usr/powerpc64le-linux-gnu/include -I/usr/powerpc64le-linux-gnu/include/c++/10/powerpc64le-linux-gnu -mcpu=power8 -mtune=power8 -DNO_WARN_X86_INTRINSICS -D__MMX__ -D__SSE__ -D__SSSE3__")

# cache flags
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" CACHE STRING "c++ flags")

# Auto-translate SSE to VSX
set(NCNN_PPC64LE_VSX ON)
19 changes: 19 additions & 0 deletions toolchains/power8le-linux-gnu-vsx.toolchain.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR powerpc64le)

set(CMAKE_C_COMPILER "powerpc64le-linux-gnu-gcc")
set(CMAKE_CXX_COMPILER "powerpc64le-linux-gnu-g++")

set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)

set(CMAKE_C_FLAGS "-mcpu=power8 -mtune=power8 -DNO_WARN_X86_INTRINSICS -D__MMX__ -D__SSE__ -D__SSSE3__")
set(CMAKE_CXX_FLAGS "-mcpu=power8 -mtune=power8 -DNO_WARN_X86_INTRINSICS -D__MMX__ -D__SSE__ -D__SSSE3__")

# cache flags
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" CACHE STRING "c++ flags")

# Auto-translate SSE to VSX
set(NCNN_PPC64LE_VSX ON)

0 comments on commit 0a8cf31

Please sign in to comment.