Skip to content

Commit

Permalink
Merge branch 'main' into mbahnas/vit_tech_report
Browse files Browse the repository at this point in the history
  • Loading branch information
mbahnasTT authored Sep 19, 2024
2 parents b5bc95a + ea8522c commit 3a6811d
Show file tree
Hide file tree
Showing 591 changed files with 7,979 additions and 2,509 deletions.
84 changes: 52 additions & 32 deletions .github/workflows/fast-dispatch-full-regressions-and-models.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,20 +26,6 @@ jobs:
cmd: tests/scripts/single_card/nightly/run_common_models.sh,
timeout: 40
},
{
name: "Common models N300 WH B0",
arch: wormhole_b0,
runs-on: ["cloud-virtual-machine", "N300", "in-service"],
cmd: tests/scripts/single_card/nightly/run_common_models.sh,
timeout: 40,
},
{
name: "Common models N150 WH BO",
arch: wormhole_b0,
runs-on: ["cloud-virtual-machine", "N150", "in-service"],
cmd: tests/scripts/single_card/nightly/run_common_models.sh,
timeout: 40,
},
{
name: "GS ttnn nightly",
arch: grayskull,
Expand Down Expand Up @@ -68,40 +54,26 @@ jobs:
cmd: tests/scripts/single_card/nightly/run_gs_only.sh,
timeout: 40
},
{
name: "N300 WH-only models",
arch: wormhole_b0,
runs-on: ["cloud-virtual-machine", "N300", "in-service"],
cmd: tests/scripts/single_card/nightly/run_wh_b0_only.sh,
timeout: 100
},
{
name: "N150 WH-only models",
arch: wormhole_b0,
runs-on: ["cloud-virtual-machine", "N150", "in-service"],
cmd: tests/scripts/single_card/nightly/run_wh_b0_only.sh,
timeout: 100
},
{
name: "API tests GS",
arch: grayskull,
runs-on: ["cloud-virtual-machine", "E150", "in-service"],
cmd: ./tests/scripts/run_tests.sh --tt-arch grayskull --pipeline-type frequent_api --dispatch-mode fast,
timeout: 40
timeout: 10
},
{
name: "API tests N300 WH B0",
arch: wormhole_b0,
runs-on: ["cloud-virtual-machine", "N300", "in-service"],
cmd: ./tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_api --dispatch-mode fast,
timeout: 40
timeout: 10
},
{
name: "API tests N150 WH B0",
arch: wormhole_b0,
runs-on: ["cloud-virtual-machine", "N150", "in-service"],
cmd: ./tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_api --dispatch-mode fast,
timeout: 40
timeout: 10
},
{
name: "[Unstable] N150 models",
Expand All @@ -120,7 +92,6 @@ jobs:
]
name: FD ${{ matrix.test-group.name }} ${{ matrix.test-group.arch }}
env:
TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
ARCH_NAME: ${{ matrix.test-group.arch }}
LOGURU_LEVEL: INFO
LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
Expand Down Expand Up @@ -155,3 +126,52 @@ jobs:
path: |
generated/test_reports/
prefix: "test_reports_"
# Nightly model tests on Wormhole B0 cards: one job per (card, model) pair
# from the matrix below. Runs after the build-artifact job.
nightly-wh-models:
  needs: build-artifact
  strategy:
    # Keep fail-fast disabled: every matrix job must run to completion so that
    # a failure in one job does not cancel the others mid-test, which reduces
    # the chance of leaving machines hanging.
    fail-fast: false
    matrix:
      card: [N150, N300]
      model: [common_models, functional_unet, llama31_8b, mamba, mistral7b, mistral7b_eth, resnet50]
  name: Nightly ${{ matrix.card }} ${{ matrix.model }}
  env:
    ARCH_NAME: wormhole_b0
    LOGURU_LEVEL: INFO
    LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
  # The runner is selected by the matrix card label.
  runs-on: ["cloud-virtual-machine", "in-service", "${{ matrix.card }}"]
  steps:
    - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0
    # The Weka mount script is run through the repo's retry-command action
    # with the retry/timeout settings below.
    - uses: ./.github/actions/retry-command
      with:
        timeout-seconds: 100
        max-retries: 10
        backoff-seconds: 60
        command: ./.github/scripts/cloud_utils/mount_weka.sh
    - name: Set up dynamic env vars for build
      run: |
        echo "TT_METAL_HOME=$(pwd)" >> "$GITHUB_ENV"
    # Note: the condition applies the ethernet-dispatch arch YAML to every
    # matrix model except plain 'mistral7b' (i.e. not only to 'mistral7b_eth').
    - name: Set up WH_ARCH_YAML for eth-enabled models
      if: ${{ matrix.model != 'mistral7b' }}
      run: |
        echo "WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml" >> "$GITHUB_ENV"
    # Fetch the Wormhole B0 build artifact (presumably published by the
    # build-artifact job -- confirm) and unpack it into the workspace.
    - uses: actions/download-artifact@v4
      with:
        name: TTMetal_build_wormhole_b0
    - name: Extract files
      run: tar -xvf ttm_wormhole_b0.tar
    - uses: ./.github/actions/install-python-deps
    # The test directory is chosen by the matrix model name.
    - name: Run frequent reg tests scripts
      timeout-minutes: 30
      run: |
        source ${{ github.workspace }}/python_env/bin/activate
        cd "$TT_METAL_HOME"
        export PYTHONPATH="$TT_METAL_HOME"
        pytest -n auto tests/nightly/single_card/${{ matrix.model }}
    # Upload test reports even when an earlier step failed; skipped only if
    # the run was cancelled.
    - uses: ./.github/actions/upload-artifact-with-job-uuid
      if: ${{ !cancelled() }}
      with:
        path: |
          generated/test_reports/
        prefix: "test_reports_"
6 changes: 6 additions & 0 deletions .github/workflows/ttnn-run-sweeps.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@ on:
- add
- line_all_gather
- logical_and_
- eltwise.subtract.subtract_interleaved
- eltwise.subalpha.subalpha_interleaved
- eltwise.rsub.rsub_interleaved
- eltwise.frac.frac_interleaved
- eltwise.ceil.ceil_interleaved
- eltwise.trunc.trunc_interleaved
- matmul.full.matmul_default_block_sharded
- matmul.full.matmul_default_height_sharded
- matmul.full.matmul_default_interleaved
Expand Down
2 changes: 1 addition & 1 deletion CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ ttnn/cpp/ttnn/operations/ccl/ @SeanNijjar @cfjchu
ttnn/cpp/ttnn/operations/pool/ @mywoodstock @shwetankTT @sankarmanoj-tt @pavlejosipovic
ttnn/cpp/ttnn/operations/conv/ @mywoodstock @shwetankTT @sankarmanoj-tt @pavlejosipovic @bbradelTT
ttnn/cpp/ttnn/operations/sliding_window/ @mywoodstock @sankarmanoj-tt @pavlejosipovic
ttnn/cpp/ttnn/operations/data_movement/ @tarafdarTT @sjameelTT @yan-zaretskiy
ttnn/cpp/ttnn/operations/data_movement/ @tarafdarTT @sjameelTT @yan-zaretskiy @jaykru-tt
ttnn/cpp/ttnn/operations/matmul/ @TT-BrianLiu @bbradelTT @yugaoTT
ttnn/cpp/ttnn/operations/eltwise/ @patrickroberts @yan-zaretskiy @eyonland
ttnn/cpp/ttnn/operations/reduction/ @SeanNijjar @tarafdarTT @sjameelTT
Expand Down
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@
| [ResNet-50 (224x224) (data parallel)](./models/demos/tgg/resnet50) | 1024 | [Two Galaxies](https://tenstorrent.com/hardware/galaxy) | 128,800 | 448,000 | |
| [ViT](./models/demos/grayskull/vit) | 9 | [e150](https://tenstorrent.com/hardware/grayskull) | 1,360 | 2,000 | |
| [Stable Diffusion 1.4 (512x512)](./models/demos/wormhole/stable_diffusion) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 0.167 | 0.3 | |
| [Unet (shallow)](./models/experimental/functional_unet) | 2 | [n150](https://tenstorrent.com/hardware/wormhole) | 51 | 1000 | |

## NLPs
| Model | Batch | Hardware | sen/sec | Target sen/sec | Release |
Expand All @@ -65,7 +64,7 @@
For the latest model updates and features, please see [MODEL_UPDATES.md](models/MODEL_UPDATES.md)

## TT-NN Tech Reports
- [Advanced Performance Optimizations for Models](./tech_reports/AdvancedPerformanceOperationsForModels/AdvancedPerformanceOptimizationsForModels.md) (updated Sept 11th)
- [Advanced Performance Optimizations for Models](./tech_reports/AdvancedPerformanceOperationsForModels/AdvancedPerformanceOptimizationsForModels.md) (updated Sept 18th)
- [Programming Mesh of Devices](./tech_reports/Programming%20Mesh%20of%20Devices/Programming%20Mesh%20of%20Devices%20with%20TT-NN.md) (updated Sept 9th)
---

Expand Down
32 changes: 22 additions & 10 deletions build_metal.sh
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ show_help() {
echo " -m Enable MemorySanitizer."
echo " -s Enable ThreadSanitizer."
echo " -u Enable UndefinedBehaviorSanitizer."
echo " -p Enable Tracy profiler."
}

# Parse CLI options
Expand All @@ -72,8 +73,9 @@ enable_msan="OFF"
enable_tsan="OFF"
enable_ubsan="OFF"
build_type="Release"
enable_profiler="OFF"

while getopts "hectamsub:" opt; do
while getopts "hectamsub:p" opt; do
case ${opt} in
h )
show_help
Expand Down Expand Up @@ -103,6 +105,9 @@ while getopts "hectamsub:" opt; do
b )
build_type="$OPTARG"
;;
p )
enable_profiler="ON"
;;
\? )
show_help
exit 1
Expand All @@ -125,13 +130,7 @@ echo "Enable MemorySanitizer: $enable_msan"
echo "Enable ThreadSanitizer: $enable_tsan"
echo "Enable UndefinedBehaviorSanitizer: $enable_ubsan"

# Create and link the build directory
mkdir -p build_$build_type
ln -nsf build_$build_type build

# Prepare cmake arguments
# -DCXX_INCLUDE_WHAT_YOU_USE=include-what-you-use
cmake_args="-B build_$build_type -G Ninja -DCMAKE_BUILD_TYPE=$build_type -DCMAKE_EXPORT_COMPILE_COMMANDS=$export_compile_commands"
build_dir="build_$build_type"

if [ "$enable_ccache" = "ON" ]; then
cmake_args="$cmake_args -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache"
Expand All @@ -157,10 +156,23 @@ if [ "$enable_ubsan" = "ON" ]; then
cmake_args="$cmake_args -DENABLE_UBSAN=ON"
fi

if [ "$enable_profiler" = "ON" ]; then
cmake_args="$cmake_args -DENABLE_TRACY=ON"
build_dir="${build_dir}_tracy"
fi

# Create and link the build directory
mkdir -p $build_dir
ln -nsf $build_dir build

# Prepare cmake arguments
# -DCXX_INCLUDE_WHAT_YOU_USE=include-what-you-use
cmake_args="$cmake_args -B $build_dir -G Ninja -DCMAKE_BUILD_TYPE=$build_type -DCMAKE_EXPORT_COMPILE_COMMANDS=$export_compile_commands"

# Configure cmake
cmake $cmake_args

# Build libraries and cpp tests
echo "Building libraries and cpp tests"
cmake --build build_$build_type --target tests # <- Can also just run `ninja tests -C build`
cmake --build build_$build_type --target install # <- This is a general cmake way, can also just run `ninja install -C build`
cmake --build $build_dir --target tests # <- Can also just run `ninja tests -C build`
cmake --build $build_dir --target install # <- This is a general cmake way, can also just run `ninja install -C build`
2 changes: 1 addition & 1 deletion create_venv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ $PYTHON_CMD -m venv $PYTHON_ENV_DIR
source $PYTHON_ENV_DIR/bin/activate

echo "Forcefully using a version of pip that will work with our view of editable installs"
pip install --force-reinstall pip==20.1.1
pip install --force-reinstall pip==21.2.4

echo "Setting up virtual env"
python3 -m pip config set global.extra-index-url https://download.pytorch.org/whl/cpu
Expand Down
8 changes: 4 additions & 4 deletions docs/source/ttnn/ttnn/dependencies/tt_lib.rst
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ New Device Operation
struct <NewOperation> {
void validate(const std::vector<Tensor> &input_tensors) const;
std::vector<Shape> compute_output_shapes(const std::vector<Tensor> &input_tensors) const;
std::vector<tt::tt_metal::LegacyShape> compute_output_shapes(const std::vector<Tensor> &input_tensors) const;
std::vector<Tensor> create_output_tensors(const std::vector<Tensor> &input_tensors) const;
operation::ProgramWithCallbacks create_program(const std::vector<Tensor>& input_tensors, std::vector<Tensor> &output_tensors) const;
};
Expand All @@ -48,7 +48,7 @@ New Device Operation with a member
int some_member
void validate(const std::vector<Tensor> &input_tensors) const;
std::vector<Shape> compute_output_shapes(const std::vector<Tensor> &input_tensors) const;
std::vector<tt::tt_metal::LegacyShape> compute_output_shapes(const std::vector<Tensor> &input_tensors) const;
std::vector<Tensor> create_output_tensors(const std::vector<Tensor> &input_tensors) const;
operation::ProgramWithCallbacks create_program(const std::vector<Tensor>& input_tensors, std::vector<Tensor> &output_tensors) const;
};
Expand All @@ -61,7 +61,7 @@ New Device Operation with Optional Input Tensors
struct <NewOperation> {
void validate(const std::vector<Tensor> &input_tensors,
const std::vector<std::optional<const Tensor>>& optional_input_tensors) const;
std::vector<Shape> compute_output_shapes(const std::vector<Tensor> &input_tensors) const;
std::vector<tt::tt_metal::LegacyShape> compute_output_shapes(const std::vector<Tensor> &input_tensors) const;
std::vector<Tensor> create_output_tensors(const std::vector<Tensor> &input_tensors) const;
operation::ProgramWithCallbacks create_program(
const std::vector<Tensor>& input_tensors,
Expand All @@ -80,7 +80,7 @@ and create_output_tensors with the additional parameter for the output_tensors.
struct <NewOperation> {
void validate_with_output_tensors(const std::vector<Tensor> &input_tensors, const std::vector<std::optional<Tensor>>& output_tensors) const;
std::vector<Shape> compute_output_shapes(const std::vector<Tensor> &input_tensors) const;
std::vector<tt::tt_metal::LegacyShape> compute_output_shapes(const std::vector<Tensor> &input_tensors) const;
std::vector<std::optional<Tensor>> create_output_tensors(const std::vector<Tensor> &input_tensors, const std::vector<std::optional<Tensor>>& output_tensors) const;
operation::ProgramWithOptionalOutputTensors create_program(const std::vector<Tensor>& input_tensors, std::vector<std::optional<Tensor>> &output_tensors) const;
Expand Down
12 changes: 11 additions & 1 deletion infra/data_collection/cicd.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,22 @@ def create_cicd_json_for_data_analysis(

workflow_outputs_dir = get_workflow_outputs_dir()

github_job_id_to_test_reports = get_github_job_id_to_test_reports(workflow_outputs_dir, github_pipeline_id)
github_job_ids = []
for raw_job in raw_jobs:
github_job_id = int(raw_job["github_job_id"])
github_job_ids.append(github_job_id)

github_job_id_to_test_reports = get_github_job_id_to_test_reports(
workflow_outputs_dir, github_pipeline_id, github_job_ids
)

jobs = []

for raw_job in raw_jobs:
github_job_id = raw_job["github_job_id"]

logger.info(f"Processing raw GitHub job {github_job_id}")

test_report_exists = github_job_id in github_job_id_to_test_reports
if test_report_exists:
test_report_path = github_job_id_to_test_reports[github_job_id]
Expand Down
13 changes: 9 additions & 4 deletions infra/data_collection/github/download_cicd_logs_and_artifacts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,18 @@ download_artifacts() {
#######################################
# Download the logs of every job in every attempt of a workflow run.
# Each job's log is written to
# generated/cicd/<workflow_run_id>/logs/<job_id>.log.
# Arguments:
#   $1 - repository, interpolated into the /repos/<repo>/... API paths
#   $2 - workflow run id
#   $3 - highest attempt number to fetch; attempts 1..$3 are downloaded
#        (defaults to 1 when omitted or empty)
# Outputs:
#   Progress messages on stdout; one log file per job under generated/cicd/.
#######################################
download_logs_for_all_jobs() {
  local repo=$1
  local workflow_run_id=$2
  local max_attempts=${3:-1}
  local logs_dir="generated/cicd/${workflow_run_id}/logs"
  local attempt_number
  local job_id

  # No-op when the caller has already created the directory.
  mkdir -p -- "$logs_dir"

  # No job id is known yet at this point, so report the workflow run id.
  echo "[info] downloading logs for workflow run with id $workflow_run_id for all attempts up to $max_attempts"
  for ((attempt_number = 1; attempt_number <= max_attempts; attempt_number++)); do
    echo "[Info] Downloading for attempt $attempt_number"

    # List the job ids of this attempt, then fetch each job's log.
    gh api "/repos/${repo}/actions/runs/${workflow_run_id}/attempts/${attempt_number}/jobs" --paginate \
      | jq '.jobs[].id' \
      | while read -r job_id; do
          echo "[info] download logs for job with id $job_id, attempt number $attempt_number"
          gh api "/repos/${repo}/actions/jobs/${job_id}/logs" > "${logs_dir}/${job_id}.log"
        done
  done
}

main() {
Expand Down
Loading

0 comments on commit 3a6811d

Please sign in to comment.