Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[release-16.0] Upgrade-Downgrade Fix: Schema-initialization stuck on semi-sync ACKs while upgrading (#13411) #13441

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 1 addition & 11 deletions .github/workflows/upgrade_downgrade_test_backups_manual.yml
Original file line number Diff line number Diff line change
Expand Up @@ -269,14 +269,6 @@ jobs:
source build.env ; cd examples/backups
./take_backups.sh

# Stopping the tablets so we can perform the upgrade.
- name: Stop tablets
if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true'
timeout-minutes: 10
run: |
source build.env ; cd examples/backups
./stop_tablets.sh

# We upgrade: we swap binaries and use the version N of the tablet.
- name: Upgrade - Swap binaries, use VTTablet N
if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true'
Expand All @@ -293,9 +285,7 @@ jobs:
timeout-minutes: 10
run: |
source build.env ; cd examples/backups
./restart_tablets.sh
# give enough time to the tablets to restore the backup
sleep 90
./upgrade_cluster.sh

# We count the number of rows in every table to check that the restore step was successful.
- name: Assert the number of rows in every table
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -272,14 +272,6 @@ jobs:
source build.env ; cd examples/backups
./take_backups.sh

# Stopping the tablets so we can perform the upgrade.
- name: Stop tablets
if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true'
timeout-minutes: 10
run: |
source build.env ; cd examples/backups
./stop_tablets.sh

# We upgrade: we swap binaries and use the version N of the tablet.
- name: Upgrade - Swap binaries, use VTTablet N
if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true'
Expand All @@ -296,9 +288,7 @@ jobs:
timeout-minutes: 10
run: |
source build.env ; cd examples/backups
./restart_tablets.sh
# give enough time to the tablets to restore the backup
sleep 90
./upgrade_cluster.sh

# We count the number of rows in every table to check that the restore step was successful.
- name: Assert the number of rows in every table
Expand Down
6 changes: 3 additions & 3 deletions examples/backups/restart_tablets.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,6 @@ for i in 101 201 301; do
exit 1
done

vtctldclient InitShardPrimary --force commerce/0 zone1-100
vtctldclient InitShardPrimary --force customer/-80 zone1-200
vtctldclient InitShardPrimary --force customer/80- zone1-300
vtctldclient PlannedReparentShard commerce/0 --new-primary "zone1-100"
vtctldclient PlannedReparentShard customer/-80 --new-primary "zone1-200"
vtctldclient PlannedReparentShard customer/80- --new-primary "zone1-300"
10 changes: 7 additions & 3 deletions examples/backups/start_cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ fi
# start vtctld
CELL=zone1 ../common/scripts/vtctld-up.sh

# Create keyspace and set the semi_sync durability policy.
vtctldclient CreateKeyspace --durability-policy=semi_sync commerce || fail "Failed to create and configure the commerce keyspace"

# start vttablets for keyspace commerce
for i in 100 101 102; do
Expand All @@ -39,12 +41,14 @@ for i in 100 101 102; do
done

# set one of the replicas to primary
vtctldclient InitShardPrimary --force commerce/0 zone1-100
vtctldclient PlannedReparentShard commerce/0 --new-primary "zone1-100"

# create the schema for commerce
vtctlclient ApplySchema -- --sql-file ./create_commerce_schema.sql commerce || fail "Could not apply schema for the commerce keyspace"
vtctlclient ApplyVSchema -- --vschema_file ../local/vschema_commerce_seq.json commerce || fail "Could not apply vschema for the commerce keyspace"

# Create keyspace and set the semi_sync durability policy.
vtctldclient CreateKeyspace --durability-policy=semi_sync customer || fail "Failed to create and configure the customer keyspace"

# start vttablets for keyspace customer
for i in 200 201 202; do
Expand All @@ -57,8 +61,8 @@ for i in 300 301 302; do
done

# set one of the replicas to primary
vtctldclient InitShardPrimary --force customer/-80 zone1-200
vtctldclient InitShardPrimary --force customer/80- zone1-300
vtctldclient PlannedReparentShard customer/-80 --new-primary "zone1-200"
vtctldclient PlannedReparentShard customer/80- --new-primary "zone1-300"

for shard in "-80" "80-"; do
wait_for_healthy_shard customer "${shard}" || exit 1
Expand Down
97 changes: 97 additions & 0 deletions examples/backups/upgrade_cluster.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#!/bin/bash

# Copyright 2023 The Vitess Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# this script brings up new tablets for the two new shards that we will
# be creating in the customer keyspace and copies the schema

source ../common/env.sh

# Restart the replica tablets so that they come up with new vttablet versions
for i in 101 102; do
echo "Shutting down tablet zone1-$i"
CELL=zone1 TABLET_UID=$i ../common/scripts/vttablet-down.sh
echo "Shutting down mysql zone1-$i"
CELL=zone1 TABLET_UID=$i ../common/scripts/mysqlctl-down.sh
echo "Removing tablet directory zone1-$i"
vtctlclient DeleteTablet -- --allow_primary=true zone1-$i
rm -Rf $VTDATAROOT/vt_0000000$i
echo "Starting tablet zone1-$i again"
CELL=zone1 TABLET_UID=$i ../common/scripts/mysqlctl-up.sh
CELL=zone1 KEYSPACE=commerce TABLET_UID=$i ../common/scripts/vttablet-up.sh
done

for i in 201 202; do
echo "Shutting down tablet zone1-$i"
CELL=zone1 TABLET_UID=$i ../common/scripts/vttablet-down.sh
echo "Shutting down mysql zone1-$i"
CELL=zone1 TABLET_UID=$i ../common/scripts/mysqlctl-down.sh
echo "Removing tablet directory zone1-$i"
vtctlclient DeleteTablet -- --allow_primary=true zone1-$i
rm -Rf $VTDATAROOT/vt_0000000$i
echo "Starting tablet zone1-$i again"
CELL=zone1 TABLET_UID=$i ../common/scripts/mysqlctl-up.sh
SHARD=-80 CELL=zone1 KEYSPACE=customer TABLET_UID=$i ../common/scripts/vttablet-up.sh
done

for i in 301 302; do
echo "Shutting down tablet zone1-$i"
CELL=zone1 TABLET_UID=$i ../common/scripts/vttablet-down.sh
echo "Shutting down mysql zone1-$i"
CELL=zone1 TABLET_UID=$i ../common/scripts/mysqlctl-down.sh
echo "Removing tablet directory zone1-$i"
vtctlclient DeleteTablet -- --allow_primary=true zone1-$i
rm -Rf $VTDATAROOT/vt_0000000$i
echo "Starting tablet zone1-$i again"
CELL=zone1 TABLET_UID=$i ../common/scripts/mysqlctl-up.sh
SHARD=80- CELL=zone1 KEYSPACE=customer TABLET_UID=$i ../common/scripts/vttablet-up.sh
done

# Wait for all the replica tablets to be in the serving state before reparenting to them.
totalTime=600
for i in 101 201 301; do
while [ $totalTime -gt 0 ]; do
status=$(curl "http://$hostname:15$i/debug/status_details")
echo "$status" | grep "REPLICA: Serving" && break
totalTime=$((totalTime-1))
sleep 0.1
done
done

# Check that all the replica tablets have reached REPLICA: Serving state
for i in 101 201 301; do
status=$(curl "http://$hostname:15$i/debug/status_details")
echo "$status" | grep "REPLICA: Serving" && continue
echo "tablet-$i did not reach REPLICA: Serving state. Exiting due to failure."
exit 1
done

# Promote the replica tablets to primary
vtctldclient PlannedReparentShard commerce/0 --new-primary "zone1-101"
vtctldclient PlannedReparentShard customer/-80 --new-primary "zone1-201"
vtctldclient PlannedReparentShard customer/80- --new-primary "zone1-301"

# Restart the old primary tablets so that they are on the latest version of vttablet too.
echo "Restarting tablet zone1-100"
CELL=zone1 TABLET_UID=100 ../common/scripts/vttablet-down.sh
CELL=zone1 KEYSPACE=commerce TABLET_UID=100 ../common/scripts/vttablet-up.sh

echo "Restarting tablet zone1-200"
CELL=zone1 TABLET_UID=200 ../common/scripts/vttablet-down.sh
SHARD=-80 CELL=zone1 KEYSPACE=customer TABLET_UID=200 ../common/scripts/vttablet-up.sh

echo "Restarting tablet zone1-300"
CELL=zone1 TABLET_UID=300 ../common/scripts/vttablet-down.sh
SHARD=80- CELL=zone1 KEYSPACE=customer TABLET_UID=300 ../common/scripts/vttablet-up.sh
1 change: 0 additions & 1 deletion examples/common/scripts/vttablet-up.sh
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ vttablet \
--service_map 'grpc-queryservice,grpc-tabletmanager,grpc-updatestream' \
--pid_file $VTDATAROOT/$tablet_dir/vttablet.pid \
--vtctld_addr http://$hostname:$vtctld_web_port/ \
--disable_active_reparents \
> $VTDATAROOT/$tablet_dir/vttablet.out 2>&1 &

# Block waiting for the tablet to be listening
Expand Down
Loading